Skip to content

Instantly share code, notes, and snippets.

@jaanus
Created October 24, 2018 07:54
Show Gist options
  • Save jaanus/757eb831035214590ba9befd22cb7e04 to your computer and use it in GitHub Desktop.
Save jaanus/757eb831035214590ba9befd22cb7e04 to your computer and use it in GitHub Desktop.
Given a list of files, compute their SHA256 checksums.
#!/usr/bin/env python3
"""For a list of files, get the checksums for the rows that don’t already have it.
Run 'find . -type f > folder.tsv' to get the list of files, and then pass that file
to this script with -i/--infile. The results are written to a new file named like the
input file with '-out' appended.
"""
import sys
import argparse
import csv
import hashlib
import operator
def fileChecksum(path, blocksize=1048576):
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is opened in binary mode and hashed in chunks of ``blocksize``
    bytes, so the whole file is never held in memory at once.

    Args:
        path: Path of the file to hash.
        blocksize: Number of bytes to read per chunk (default 1 MiB).

    Returns:
        The hexadecimal SHA-256 digest as a string, or an empty string if the
        file could not be opened or read.
    """
    sha = hashlib.sha256()
    try:
        with open(path, 'rb') as infile:
            # read() returns b'' at end of file, which stops the iteration.
            for chunk in iter(lambda: infile.read(blocksize), b''):
                sha.update(chunk)
    except OSError:
        # OSError covers FileNotFoundError as well as PermissionError,
        # IsADirectoryError and read failures. One unreadable entry must not
        # abort a long run; callers treat '' as "could not compute".
        return ''
    return sha.hexdigest()
def main(arguments):
    """Fill in the missing checksums for the files listed in a TSV file.

    Each input row holds a filename and, optionally, a checksum in a second
    tab-separated column. Rows that already have a checksum are kept as they
    are; for the others the checksum is computed with ``fileChecksum``. The
    rows are then sorted by checksum and filename and written to a new file
    whose name is the input file name with ``-out`` appended.

    Args:
        arguments: Command-line arguments without the program name,
            e.g. ``['-i', 'folder.tsv']``.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i', '--infile', help="Input/output file in TSV format (filename and checksum)", required=True, type=argparse.FileType('r'))
    args = parser.parse_args(arguments)

    # Read the list of files into memory, closing the input once it is consumed.
    files = []
    with args.infile:
        reader = csv.reader(args.infile, delimiter="\t", quotechar="\"")
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; there is no filename to process.
                continue
            if len(row) == 2:
                filename, checksum = row
            else:
                # Any other column count: keep the first column and treat the checksum as missing.
                filename, checksum = row[0], ''
            files.append([filename, checksum])

    fileCount = len(files)
    # Checksum only the files that did not already have a checksum in the input file.
    for index, entry in enumerate(files):
        filename, checksum = entry
        status = "Previous checksum"
        if not checksum:
            checksum = fileChecksum(filename)
            # fileChecksum signals failure with an empty string.
            status = "Calculated checksum" if checksum else "Error calculating checksum"
            entry[1] = checksum
        # Clear to end of line, then overwrite the progress line in place.
        sys.stdout.write("\033[K")
        print("{} / {} {}".format(index+1, fileCount, status), end="\r")

    # Sort first by checksum, then by filename, so identical files end up adjacent.
    files.sort(key=operator.itemgetter(1, 0))
    # newline='' lets the csv writer control the line endings itself.
    with open(args.infile.name + "-out", "w", newline='') as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerows(files)
    print("\nDone.")
if __name__ == '__main__':
    # Script entry point: hand the command-line arguments (without the program
    # name) to main() and use its return value as the process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment