Skip to content

Instantly share code, notes, and snippets.

@kgjenkins
Created May 3, 2024 17:45
Show Gist options
  • Save kgjenkins/ba5868ebb3d99786b8d079414ad9ba85 to your computer and use it in GitHub Desktop.
Save kgjenkins/ba5868ebb3d99786b8d079414ad9ba85 to your computer and use it in GitHub Desktop.
Generate hashes for a CSV of file paths
# This script goes through a list of files exported from Excel ("data from folder")
# and computes an MD5 hash of each file's contents (to be used for deduplication).
import csv
import os
import hashlib
import time
# columns: Name,Extension,Date accessed,Date modified,Date created,Folder Path,Hash
FIELDNAMES = [
    'Name',
    'Extension',
    'Date accessed',
    'Date modified',
    'Date created',
    'Folder Path',
    'Hash',
]

# Number of bytes read per call while hashing, so large files are never
# loaded into memory in one piece.
CHUNK_SIZE = 65536

infile = 'test-files.csv'
# splitext (rather than str.replace) only touches the real extension and
# guarantees outfile differs from infile, so opening the output for writing
# can never truncate the input.
outfile = os.path.splitext(infile)[0] + '-hashes.csv'


def md5_of_file(filepath, chunk_size=CHUNK_SIZE):
    """Return the hexadecimal MD5 digest of the contents of *filepath*.

    The file is opened in binary mode and read *chunk_size* bytes at a time.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object read() returns at EOF.
        for data in iter(lambda: f.read(chunk_size), b''):
            digest.update(data)
    return digest.hexdigest()


def write_hashes(infile, outfile, path_filter='boundaries'):
    """Copy the rows of *infile* to *outfile*, filling in the ``Hash`` column.

    Each input row must provide ``Folder Path`` and ``Name``; the two are
    joined to locate the file, whose MD5 digest is stored under ``Hash``.
    Rows whose joined path does not contain *path_filter* are skipped and
    are not written to the output. Pass ``None`` or ``''`` to keep every row.

    The output always has the columns listed in ``FIELDNAMES``; input columns
    outside that list are dropped and missing ones are written empty.

    Raises:
        KeyError: if a row lacks ``Folder Path`` or ``Name``.
        OSError: if the input, the output or a listed file cannot be opened.
    """
    # Both files are managed by one ``with`` so the output is closed even
    # when hashing fails part-way through. The output mirrors the input's
    # utf-8-sig encoding instead of depending on the locale default.
    with open(infile, mode='r', encoding='utf-8-sig', newline='') as csvfile, \
            open(outfile, mode='w', encoding='utf-8-sig', newline='') as out:
        reader = csv.DictReader(csvfile)
        # csv.DictWriter quotes fields containing commas, quotes or newlines,
        # which a plain ','.join() would silently corrupt.
        writer = csv.DictWriter(
            out, fieldnames=FIELDNAMES, restval='', extrasaction='ignore')
        writer.writeheader()
        for row in reader:
            filepath = os.path.join(row['Folder Path'], row['Name'])
            if path_filter and path_filter not in filepath:
                continue
            print(filepath)
            # Use the same key as the header so an existing (empty) Hash
            # column is overwritten rather than duplicated.
            row['Hash'] = md5_of_file(filepath)
            writer.writerow(row)


if __name__ == '__main__':
    t = time.perf_counter()
    write_hashes(infile, outfile)
    print(time.perf_counter() - t, 'seconds')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment