Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import pandas as pd
import argparse
import hashlib
import re
import os
def read_args(argv=None):
    """Parse the command-line arguments of the md5 script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process's own command line,
            which is what the original zero-argument call did.

    Returns:
        The ``argparse.Namespace`` produced by the parser; it carries a
        single ``target_dir`` attribute.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "target_dir",
        help="Directory containing files to compute md5 of.",
    )
    return parser.parse_args(argv)
def get_files(target_dir):
    """List every file found under ``target_dir``, recursively.

    Args:
        target_dir: Directory to walk with ``os.walk``.

    Returns:
        A pandas DataFrame with one row per file and two columns:
        ``path`` (the walk root joined with the file name) and
        ``basename`` (the file name alone). The frame is empty, but still
        has both columns, when no files are found.
    """
    rows = []
    for root, _dirs, names in os.walk(target_dir):
        rows.extend((os.path.join(root, name), name) for name in names)
    # Build the frame once from the collected rows: DataFrame.append was
    # removed in pandas 2.0, and appending one row at a time copied the
    # whole frame on every file.
    return pd.DataFrame(rows, columns=["path", "basename"])
def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and fed to the hash in chunks so
    that it does not need to be held in memory all at once.

    Args:
        filename: Path of the file to hash.
        blocksize: Size argument passed to each ``read`` call. Must not
            be zero.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If ``blocksize`` is zero.
    """
    # read(0) returns b"" immediately, which would end the loop before any
    # data is hashed and silently yield the digest of empty input.
    if blocksize == 0:
        raise ValueError("blocksize must not be zero")
    digest = hashlib.md5()
    with open(filename, "rb") as f:
        # iter() with a sentinel keeps reading until read() returns b"" (EOF).
        for block in iter(lambda: f.read(blocksize), b""):
            digest.update(block)
    return digest.hexdigest()
def compute_md5(all_files):
    """Compute the MD5 digest for each entry of the ``path`` column.

    Args:
        all_files: Object exposing a ``path`` attribute whose ``apply``
            method is called with ``md5sum`` (in this file, the DataFrame
            returned by ``get_files``).

    Returns:
        Whatever ``all_files.path.apply(md5sum)`` yields: one digest per
        path.
    """
    return all_files.path.apply(md5sum)
def main():
    """Hash every file under the directory named on the command line.

    The results are written as CSV (no index column) to a file whose name
    is the directory argument with path separators replaced by ``_``,
    followed by ``_md5.csv``. The CSV holds the columns produced by
    ``get_files`` plus an ``md5`` column.
    """
    args = read_args()
    # Flatten the directory argument into a single file-name component.
    # "/" is always replaced (as before); the platform's own separators are
    # added so that the output name does not keep separators on systems
    # where os.sep is not "/".
    separators = {"/", os.sep, os.altsep} - {None}
    sep_pattern = "[" + "".join(re.escape(s) for s in sorted(separators)) + "]"
    output_file_name = re.sub(sep_pattern, "_", args.target_dir)
    all_files = get_files(args.target_dir)
    all_files["md5"] = compute_md5(all_files)
    all_files.to_csv(output_file_name + "_md5.csv", index=False)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.