搭配 os.walk + hash 搜寻子目录下之重覆档案(图档为例) Source Code download
关于依档案类型搜寻的做法,请参考上一篇。
import hashlib
import os


def _file_md5(path, chunk_size=65536):
    """Return the raw MD5 digest (bytes) of the file at *path*.

    The file is read in fixed-size chunks so large files are never loaded
    into memory in one piece, and the handle is always closed.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        # iter(callable, sentinel) keeps calling read() until it returns b''.
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.digest()


#--- Find duplicated files
def findOverlap(nPath, fTypes):
    """Walk *nPath* recursively and report files with identical content.

    Args:
        nPath: Directory where the search starts; sub-directories are
            included.
        fTypes: Container of file extensions (without the dot, e.g.
            ``['jpg', 'png']``) to consider. Matching is case-sensitive.

    Returns:
        A tuple ``(allimage, overlapA, overlapB)``:

        * ``allimage`` -- every matching file found, as the walked
          directory name joined with the file name.
        * ``overlapA`` -- absolute paths of files whose content hash had
          already been seen earlier in the walk.
        * ``overlapB`` -- for each entry of ``overlapA``, the absolute path
          of the first file seen with that same hash.

    Two progress lines are printed per call/directory, as in the original.
    """
    allimage = []
    allhsh = {}      # key: content hash / value: absolute path of first file
    overlapA = []    # duplicated file, location A (the later one)
    overlapB = []    # duplicated file, location B (the first one seen)

    f_tree = os.walk(nPath)  # os.walk returns a generator
    print(f'return a generator: {type(f_tree)}')
    for dirname, _subdirs, files in f_tree:  # one directory level at a time
        print(f'file count of this folder: {len(files)}')
        for file in files:
            # Require a real '.' so a dot-less file called "jpg" is not
            # mistaken for an image, and filter on the fTypes argument
            # (not on a module-level global).
            _stem, dot, ext = file.rpartition('.')
            if not dot or ext not in fTypes:
                continue

            img = os.path.join(dirname, file)
            allimage.append(img)

            imghsh = _file_md5(img)
            fname = os.path.abspath(img)
            if imghsh in allhsh:
                # Hash already known: record the pair (new file, first file).
                overlapA.append(fname)
                overlapB.append(allhsh[imghsh])
            else:
                # First time this content is seen: remember where.
                allhsh[imghsh] = fname

    return allimage, overlapA, overlapB
#--- Main flow -----
# Directory to search; an empty answer falls back to the current directory.
pathHere = os.getcwd()  # current working directory
path = input('从哪个资料夹 开始搜寻 ? ') or pathHere
print(f'搜寻资料夹: {path} (含子目录)图档')

# File extensions to look for.
filetypes = ['jpg', 'png', 'bmp', 'jpeg']

iFile, overA, overB = findOverlap(path, filetypes)
print(f'图档数量: {len(iFile)} 重覆者: {len(overA)}')

# Only list the pairs on the console when at least one duplicate was found.
if overA:
    print("找到下列重覆的档案:")
    for dup_path, first_path in zip(overA, overB):
        print(f'位置A: {dup_path}\n位置B: {first_path}')
# Save the results to overlap.txt inside pathHere.
# os.path.join replaces the hard-coded '\overlap.txt': '\o' is an invalid
# escape sequence, and a literal backslash is only a path separator on
# Windows, so elsewhere the report ended up outside pathHere.
report_path = os.path.join(pathHere, 'overlap.txt')
# The with-block closes the file even if one of the writes fails.
with open(report_path, 'w', encoding='utf-8') as f:
    print(f'图档数量: {len(iFile)} 重覆者: {len(overA)}', file=f)
    print("找到下列重覆的档案:", file=f)
    for dup_path, first_path in zip(overA, overB):
        print(f'位置A: {dup_path}\n位置B: {first_path}\n', file=f)