Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
#!/usr/bin/env python
""" sorts lines (or tab-sep records) by md5. (e.g. for train/test splits).
optionally prepends with the md5 id too.
brendan o'connor - - """
import hashlib,sys,optparse
def md5_key(line, k=None):
    """Return the hex md5 digest used as the sort key for one input line.

    line -- one raw input line as bytes; a single trailing newline, if
            present, is excluded from the hash.
    k    -- optional 1-based index of the tab-separated field to hash
            instead of the whole line; a falsy value hashes the whole line.

    Raises IndexError if the line has fewer than k tab-separated fields.
    """
    # Strip only a real newline: the last line of a file may lack one, and
    # blindly dropping the final character would change what gets hashed.
    record = line[:-1] if line.endswith(b"\n") else line
    if k:
        record = record.split(b"\t")[k - 1]
    return hashlib.md5(record).hexdigest()


def md5_sort(lines, k=None, prepend=False):
    """Return the given lines (bytes) ordered by their md5 key.

    lines   -- iterable of raw input lines as bytes.
    k       -- passed through to md5_key (1-based tab-separated field).
    prepend -- if true, each output line is prefixed with its md5 and a tab.

    Every returned line ends with a newline, so a final input line without
    one cannot fuse with its neighbour after reordering.  Lines with equal
    keys keep their input order (the sort is stable).
    """
    # Hash each line once and carry the digest along for the optional prefix.
    keyed = sorted(((md5_key(line, k), line) for line in lines),
                   key=lambda pair: pair[0])
    out = []
    for digest, line in keyed:
        if not line.endswith(b"\n"):
            line += b"\n"
        if prepend:
            line = digest.encode("ascii") + b"\t" + line
        out.append(line)
    return out


def main(argv=None):
    """Command-line entry point: sort stdin by md5 and write it to stdout.

    argv -- argument list to parse; None means sys.argv[1:].
    """
    p = optparse.OptionParser()
    p.add_option('-k', type='int', default=False,
                 help="hash only this tab-separated field (1-based)")
    p.add_option('-p', action='store_true',
                 help="prepend each output line with its md5 and a tab")
    opts, _args = p.parse_args(argv)
    # md5 operates on bytes; use the binary streams where they exist
    # (Python 3) and the plain byte-oriented streams otherwise (Python 2).
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)
    stdout.writelines(md5_sort(stdin.readlines(), opts.k, opts.p))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment