Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Read file paths, names, hashes into a data frame
from fastai2.vision.all import * # to get L
import pandas as pd
def readMD5file(md5path:Path) -> pd.DataFrame:
    """
    Read an ``md5sum`` output file into a DataFrame of hashes, paths and file names.

    Generate the MD5 output file by doing a search like:
        find /home/jupyter/data/foldersToAdd/ -iname '*jpg' -print0 | xargs -0 -n 100 md5sum >> /home/jupyter/data/foldersToAdd.md5.out
    Then read it with this to make a dataframe to check for name uniqueness,
    path uniqueness, etc.

    Each line is parsed as ``<hash><space><marker><path>``, where the marker is
    the second space (text mode) or ``*`` (binary mode) that ``md5sum`` writes
    before the file name. A plain ``<hash><space><path>`` line is accepted as
    well. Lines that do not yield both a hash and a path (for example the empty
    line at the end of the file) are skipped.

    Args:
        md5path: Location of the ``md5sum`` output file.

    Returns:
        A DataFrame with the string columns ``hash``, ``path`` and ``fname``
        (the final component of ``path``), with rows sorted by hash and then
        by path.
    """
    rows = []
    with open(md5path, 'r') as f:
        for raw in f:
            line = raw.rstrip('\n')
            # Split on the FIRST space only: the hash never contains one, but
            # the path may, and such paths must not be thrown away.
            digest, sep, rest = line.partition(' ')
            if not sep:
                continue
            # Drop the one-character mode marker that follows the separator.
            if rest[:1] in (' ', '*'):
                rest = rest[1:]
            if digest and rest:
                rows.append((digest, rest))
    rows.sort()
    dff = pd.DataFrame(rows, columns=['hash','path'])
    dff['fname'] = dff['path'].map(lambda p: Path(p).name)
    return dff
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment