Created
May 3, 2024 17:45
-
-
Save kgjenkins/ba5868ebb3d99786b8d079414ad9ba85 to your computer and use it in GitHub Desktop.
generate hashes for a csv of filepaths
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script goes through a list of files exported by Excel's "data from folder"
# and computes a hash of each file's contents (to be used for deduplication).
import csv
import os
import hashlib
import time

# columns: Name,Extension,Date accessed,Date modified,Date created,Folder Path,Hash
infile = 'test-files.csv'

# Only paths containing this text are hashed; set to '' to hash every listed file.
path_filter = 'boundaries'

# Name of the output column that receives the digest.
HASH_COLUMN = 'Hash'

# Header used when the input file is completely empty (no header row to copy).
DEFAULT_COLUMNS = [
    'Name',
    'Extension',
    'Date accessed',
    'Date modified',
    'Date created',
    'Folder Path',
]


def hash_file(filepath, chunk_size=65536):
    """Return the hexadecimal MD5 digest of the contents of *filepath*.

    The file is read in pieces of *chunk_size* bytes so that large files are
    never loaded into memory in one go.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel stops as soon as read() returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def default_outfile(path):
    """Return *path* with '-hashes' inserted in front of its extension.

    Unlike a plain ``str.replace('.csv', ...)`` the result always differs
    from *path*, so the input file can never be opened for writing by
    accident when it lacks a '.csv' suffix.
    """
    root, ext = os.path.splitext(path)
    return root + '-hashes' + ext


def write_hashes(in_path, out_path, required_text=''):
    """Copy the rows of *in_path* to *out_path*, filling in the hash column.

    Each input row must provide 'Folder Path' and 'Name'; the two are joined
    to locate the file to hash.  Rows whose joined path does not contain
    *required_text* are left out of the output.  A file that cannot be read
    is reported on stdout and written with an empty hash instead of aborting
    the whole run.

    The output reuses the input's header, adding the hash column when the
    input does not already have one, and is written through the csv module
    so that fields containing commas or quotes are quoted correctly.

    Returns:
        The number of rows written (not counting the header).
    """
    written = 0
    # 'utf-8-sig' strips the BOM that Excel puts at the start of CSV exports.
    with open(in_path, mode='r', encoding='utf-8-sig', newline='') as src, \
            open(out_path, mode='w', encoding='utf-8', newline='') as dst:
        reader = csv.DictReader(src)
        fieldnames = list(reader.fieldnames or DEFAULT_COLUMNS)
        if HASH_COLUMN not in fieldnames:
            fieldnames.append(HASH_COLUMN)
        writer = csv.DictWriter(dst, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            filepath = os.path.join(row['Folder Path'], row['Name'])
            if required_text not in filepath:
                continue
            print(filepath)
            try:
                row[HASH_COLUMN] = hash_file(filepath)
            except OSError as err:
                # Listings go stale; keep the row but leave the hash blank.
                print('  could not read file:', err)
                row[HASH_COLUMN] = ''
            writer.writerow(row)
            written += 1
    return written


def main():
    """Hash the files listed in `infile` and report the elapsed time."""
    started = time.perf_counter()
    write_hashes(infile, default_outfile(infile), path_filter)
    print(time.perf_counter() - started, 'seconds')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment