Created
November 2, 2018 17:07
-
-
Save philerooski/ee9105026e50e17e8f5f002614d99b57 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import hashlib
import os
import re

import pandas as pd
def read_args():
    """Parse the command line.

    Returns:
        The ``argparse.Namespace`` produced by the parser; its single
        attribute ``target_dir`` holds the positional directory argument.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "target_dir",
        help="Directory containing files to compute md5 of.",
    )
    return cli.parse_args()
def get_files(target_dir):
    """Recursively list every file beneath ``target_dir``.

    Args:
        target_dir: Root directory handed to ``os.walk``.

    Returns:
        A ``pandas.DataFrame`` with one row per file and two columns:
        ``path`` (the walked directory joined with the file name) and
        ``basename`` (the file name alone). When no files are found the
        frame is empty but still carries both columns.
    """
    # Gather plain records and build the frame once. ``DataFrame.append``
    # was removed in pandas 2.0, and calling it per file re-copied the
    # whole frame on every iteration.
    records = []
    for root, _dirs, names in os.walk(target_dir):
        for name in names:
            records.append({"path": os.path.join(root, name),
                            "basename": name})
    # Passing ``columns`` keeps the column order and guarantees the columns
    # exist even when ``records`` is empty.
    return pd.DataFrame(records, columns=["path", "basename"])
def md5sum(filename, blocksize=65536):
    """Return the hexadecimal MD5 digest of a file's contents.

    Args:
        filename: Path of the file to hash; it is opened in binary mode.
        blocksize: Number of bytes requested per read, so the file is
            hashed incrementally rather than loaded in one piece.

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hashlib.md5()
    with open(filename, "rb") as stream:
        while True:
            chunk = stream.read(blocksize)
            # An empty bytes object marks the end of the file.
            if chunk == b"":
                break
            digest.update(chunk)
    return digest.hexdigest()
def compute_md5(all_files):
    """Hash every file listed in ``all_files``.

    Args:
        all_files: Object exposing a ``path`` attribute whose ``apply``
            method is called with ``md5sum`` (in this script, the
            DataFrame returned by ``get_files``).

    Returns:
        The result of applying ``md5sum`` to each entry of
        ``all_files.path``.
    """
    paths = all_files.path
    return paths.apply(md5sum)
def main():
    """Hash every file under the target directory and write a CSV report.

    The report is named after the target directory with each ``/``
    replaced by ``_`` and ``_md5.csv`` appended. It is written relative to
    the current working directory and contains the ``path``, ``basename``
    and ``md5`` columns without the index.
    """
    args = read_args()
    # Flatten the directory argument into a single file-name component.
    prefix = args.target_dir.replace("/", "_")
    listing = get_files(args.target_dir)
    listing["md5"] = compute_md5(listing)
    listing.to_csv(prefix + "_md5.csv", index=False)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment