Python搜寻重覆档案 hash

搭配 os.walk 与 hash,搜寻子目录下的重覆档案(以图档为例)。Source Code download
依档案类型搜寻的做法,请参考上一篇。

import hashlib
import os


def _file_md5(path, chunk_size=1 << 20):
    """Return the raw MD5 digest of the file at *path*, reading it in chunks."""
    digest = hashlib.md5()
    # The context manager closes the handle even if a read fails; chunked
    # reads keep memory use bounded for large files.
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.digest()


#--- Find duplicated files
def findOverlap(nPath, fTypes):
    """Find files with identical content under *nPath* (subfolders included).

    Files are compared by the MD5 digest of their content. Only files whose
    extension (the text after the last '.') is in *fTypes* are considered;
    the comparison is case-sensitive and files without a '.' are skipped.

    Args:
        nPath: Folder where the search starts.
        fTypes: Extensions to look for, without the dot, e.g. ['jpg', 'png'].

    Returns:
        A tuple ``(allimage, overlapA, overlapB)``:
        allimage -- path of every matching file, as folder joined with name;
        overlapA -- absolute paths of files whose content was already seen;
        overlapB -- for each entry of overlapA, the absolute path of the first
                    file found with the same content.
    """
    allimage = []
    allhsh = {}        # key: content digest / value: absolute path of first file
    overlapA = []      # duplicated file, location A (the later one)
    overlapB = []      # duplicated file, location B (the one seen first)

    f_tree = os.walk(nPath)    # os.walk returns a generator
    print(f'return a generator: {type(f_tree)}')

    for dirname, _subdirs, files in f_tree:    # one folder level at a time
        print(f'file count of this folder: {len(files)}')
        for file in files:
            # rpartition yields an empty separator when the name has no '.',
            # so an extensionless file called "jpg" is not taken for an image.
            _, dot, ext = file.rpartition('.')
            # Filter on the fTypes argument rather than a module-level global.
            if not dot or ext not in fTypes:
                continue

            img = os.path.join(dirname, file)
            allimage.append(img)

            imghsh = _file_md5(img)
            fname = os.path.abspath(img)
            if imghsh in allhsh:
                overlapA.append(fname)
                overlapB.append(allhsh[imghsh])   # B holds the file that already owns this hash
            else:
                allhsh[imghsh] = fname

    return allimage, overlapA, overlapB
#--- Main flow -----
# Folder to scan: whatever the user types, or the current working directory
# when the answer is empty.
pathHere = os.getcwd()   # current working directory
path = input('从哪个资料夹 开始搜寻 ? ') or pathHere
print(f'搜寻资料夹: {path} (含子目录)图档')

# File extensions to filter on
filetypes = ['jpg', 'png', 'bmp', 'jpeg']

iFile, overA, overB = findOverlap(path, filetypes)
print(f'图档数量: {len(iFile)} 重覆者: {len(overA)}')
if overA:
    print("找到下列重覆的档案:")
    # overA and overB are parallel lists: one pair per duplicate found.
    for dup_path, first_path in zip(overA, overB):
        print(f'位置A: {dup_path}\n位置B: {first_path}')
# Save the result to overlap.txt inside the start-up working directory.
# os.path.join is used instead of the literal '\overlap.txt': '\o' is an
# invalid escape sequence, and a backslash is only a path separator on Windows.
report_path = os.path.join(pathHere, 'overlap.txt')
# The with-statement closes the file even if one of the writes raises.
with open(report_path, 'w', encoding='utf-8') as f:
    print(f'图档数量: {len(iFile)} 重覆者: {len(overA)}', file=f)
    print("找到下列重覆的档案:", file=f)
    # overA and overB are parallel lists: one pair per duplicate found.
    for dup_path, first_path in zip(overA, overB):
        print(f'位置A: {dup_path}\n位置B: {first_path}\n', file=f)

关于作者: 网站小编

码农网专注IT技术教程资源分享平台,学习资源下载网站,58码农网包含计算机技术、网站程序源码下载、编程技术论坛、互联网资源下载等产品服务,提供原创、优质、完整内容的专业码农交流分享平台。

热门文章