# kgjenkins/hash.py

## hash.py
# This script reads a CSV listing of files (from Excel's "data from folder" import)
# and computes an MD5 hash of each file's contents (to be used for deduplication).

import csv
import os
import hashlib
import time

# columns: Name,Extension,Date accessed,Date modified,Date created,Folder Path,Hash

infile = 'test-files.csv'

# Column order of the output file. 'Hash' is the column this script fills in;
# input columns that are not listed here are not copied to the output, so
# every data row always lines up with this header.
FIELDNAMES = ['Name', 'Extension', 'Date accessed', 'Date modified',
              'Date created', 'Folder Path', 'Hash']

# Only files whose joined path contains this substring are hashed.
PATH_FILTER = 'boundaries'

# Number of bytes read per chunk, so a large file is never held in memory whole.
CHUNK_SIZE = 65536


def _md5_of_file(path):
    """Return the hexadecimal MD5 digest of the contents of the file at *path*.

    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # iter() with a sentinel keeps calling f.read() until it returns b'' (EOF).
        for chunk in iter(lambda: f.read(CHUNK_SIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()


t = time.perf_counter()

# Insert '-hashes' before the extension. Unlike str.replace('.csv', ...), this
# can never produce a name equal to infile (e.g. for 'LIST.CSV'), which would
# make the script truncate its own input.
root, ext = os.path.splitext(infile)
outfile = root + '-hashes' + ext

# Both files are managed by the same `with`, so the output is flushed and closed
# even when hashing fails part-way through. The output encoding is explicit so
# non-ASCII file names do not depend on the platform's default encoding.
with open(infile, mode='r', encoding='utf-8-sig', newline='') as csvfile, \
        open(outfile, mode='w', encoding='utf-8', newline='') as out:
    reader = csv.DictReader(csvfile)
    # The csv writer quotes fields that contain commas or quotes (common in
    # file and folder names); a plain ','.join() would shift the columns.
    # lineterminator=os.linesep keeps the platform's usual line ending.
    writer = csv.DictWriter(out, fieldnames=FIELDNAMES, restval='',
                            extrasaction='ignore', lineterminator=os.linesep)
    writer.writeheader()
    for row in reader:
        filepath = os.path.join(row['Folder Path'], row['Name'])
        if PATH_FILTER not in filepath:
            continue
        print(filepath)
        # Store the digest under the header's exact column name ('Hash'), so an
        # input that already carries a Hash column is overwritten, not widened.
        row['Hash'] = _md5_of_file(filepath)
        writer.writerow(row)

print(time.perf_counter() - t, 'seconds')
	# this script goes through a list of files from Excel "data from folder"
	# and computes a hash of the file contents (to be used for deduplication)

	import csv
	import os
	import hashlib
	import time

	# columns: Name,Extension,Date accessed,Date modified,Date created,Folder Path,Hash

	infile = 'test-files.csv'

	# Column order of the output file. 'Hash' is the column this script fills in;
	# input columns that are not listed here are not copied to the output, so
	# every data row always lines up with this header.
	FIELDNAMES = ['Name', 'Extension', 'Date accessed', 'Date modified',
	              'Date created', 'Folder Path', 'Hash']

	# Only files whose joined path contains this substring are hashed.
	PATH_FILTER = 'boundaries'

	# Number of bytes read per chunk, so a large file is never held in memory whole.
	CHUNK_SIZE = 65536


	def _md5_of_file(path):
	    """Return the hexadecimal MD5 digest of the contents of the file at *path*.

	    Raises OSError (e.g. FileNotFoundError) if the file cannot be opened or read.
	    """
	    digest = hashlib.md5()
	    with open(path, 'rb') as f:
	        # iter() with a sentinel keeps calling f.read() until it returns b'' (EOF).
	        for chunk in iter(lambda: f.read(CHUNK_SIZE), b''):
	            digest.update(chunk)
	    return digest.hexdigest()


	t = time.perf_counter()

	# Insert '-hashes' before the extension. Unlike str.replace('.csv', ...), this
	# can never produce a name equal to infile (e.g. for 'LIST.CSV'), which would
	# make the script truncate its own input.
	root, ext = os.path.splitext(infile)
	outfile = root + '-hashes' + ext

	# Both files are managed by the same `with`, so the output is flushed and closed
	# even when hashing fails part-way through. The output encoding is explicit so
	# non-ASCII file names do not depend on the platform's default encoding.
	with open(infile, mode='r', encoding='utf-8-sig', newline='') as csvfile, \
	        open(outfile, mode='w', encoding='utf-8', newline='') as out:
	    reader = csv.DictReader(csvfile)
	    # The csv writer quotes fields that contain commas or quotes (common in
	    # file and folder names); a plain ','.join() would shift the columns.
	    # lineterminator=os.linesep keeps the platform's usual line ending.
	    writer = csv.DictWriter(out, fieldnames=FIELDNAMES, restval='',
	                            extrasaction='ignore', lineterminator=os.linesep)
	    writer.writeheader()
	    for row in reader:
	        filepath = os.path.join(row['Folder Path'], row['Name'])
	        if PATH_FILTER not in filepath:
	            continue
	        print(filepath)
	        # Store the digest under the header's exact column name ('Hash'), so an
	        # input that already carries a Hash column is overwritten, not widened.
	        row['Hash'] = _md5_of_file(filepath)
	        writer.writerow(row)

	print(time.perf_counter() - t, 'seconds')