Python搜寻重覆档案 hash-58码农网

搭配 os.walk + hash 搜寻子目录下之重覆档案(图档为例) Source Code download
依档案类型搜寻的做法，请参考上一篇。

import hashlib
import os


def _md5_digest(path, chunk_size=1 << 20):
    """Return the raw MD5 digest (bytes) of the file at *path*.

    The file is read in pieces of ``chunk_size`` bytes so that large files
    are never loaded into memory in one go, and the handle is always closed.
    """
    md5 = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            md5.update(chunk)
    return md5.digest()


# --- Find duplicated files
def findOverlap(nPath, fTypes):
    """Search *nPath* and its sub-directories for files with equal content.

    Files are selected by extension and compared by the MD5 digest of their
    bytes; the first file seen with a given digest is treated as the
    reference copy and every later file with the same digest is reported
    as a duplicate of it.

    Args:
        nPath: Directory where the search starts.
        fTypes: Iterable of extensions to look for, without the leading
            dot (e.g. ``['jpg', 'png']``). Matching ignores letter case.

    Returns:
        A tuple ``(allimage, overlapA, overlapB)``:
        ``allimage`` lists every matching file path that was found;
        ``overlapA[i]`` is the absolute path of a duplicate and
        ``overlapB[i]`` is the absolute path of the earlier file that has
        the same digest.

    Raises:
        OSError: If a matching file cannot be opened or read.
    """
    # Use the parameter, not a module-level global, and compare
    # case-insensitively so that e.g. "IMG.JPG" is not missed.
    wanted = {ftype.lower() for ftype in fTypes}

    allimage = []
    allhsh = {}      # key: digest / value: absolute path of first file seen
    overlapA = []    # duplicated file, location A (the later one)
    overlapB = []    # duplicated file, location B (the first one seen)

    f_tree = os.walk(nPath)    # os.walk returns a generator
    print(f'return a generator: {type(f_tree)}')

    for dirname, _subdirs, files in f_tree:
        # Walk down one directory level at a time.
        print(f'file count of this folder: {len(files)}')

        for file in files:
            # Require a real "." so a file merely *named* "png" is skipped.
            _, dot, ext = file.rpartition('.')
            if not dot or ext.lower() not in wanted:
                continue

            img = os.path.join(dirname, file)
            allimage.append(img)

            imghsh = _md5_digest(img)
            fname = os.path.abspath(img)
            if imghsh in allhsh:
                overlapA.append(fname)
                overlapB.append(allhsh[imghsh])
            else:
                # First time this digest is seen: remember where.
                allhsh[imghsh] = fname

    return allimage, overlapA, overlapB

# --- Main flow ---
# File extensions that count as images for this search.
filetypes = ['jpg', 'png', 'bmp', 'jpeg']

# Directory to search; an empty answer falls back to the current directory.
pathHere = os.getcwd()
path = input('从哪个资料夹 开始搜寻 ? ') or pathHere
print(f'搜寻资料夹: {path} (含子目录)图档')

iFile, overA, overB = findOverlap(path, filetypes)
print(f'图档数量: {len(iFile)} 重覆者: {len(overA)}')

# List each duplicate (A) next to the file it duplicates (B).
if overA:
    print("找到下列重覆的档案：")
    for dup_path, first_path in zip(overA, overB):
        print(f'位置A: {dup_path}\n位置B: {first_path}')

# Save the results to overlap.txt in the directory the script was started from.
# os.path.join is used instead of a hard-coded '\' so the path is valid on
# every platform and no invalid "\o" escape sequence appears in the source.
with open(os.path.join(pathHere, 'overlap.txt'), 'w', encoding='utf-8') as f:
    print(f'图档数量: {len(iFile)} 重覆者: {len(overA)}', file=f)
    print("找到下列重覆的档案：", file=f)
    # One entry per duplicate: the duplicate (A) and the file it matches (B).
    for dup_path, first_path in zip(overA, overB):
        print(f'位置A: {dup_path}\n位置B: {first_path}\n', file=f)

给这篇文章的作者打赏

关于作者: 网站小编

相关文章

HBO Max vs.Netflix：当你负担不起两者时如何选择

课内笔记整理---作业系统实务(资安相关篇)

excel vba捞网页数据问题

热门文章

1Python搜寻重覆档案 hash

2From mud to Structure

3meownaori1630

4Power BI Course in Bangalore

5服务自动化中提高效率真的好吗？