Skip to content

Instantly share code, notes, and snippets.

@e3krisztian
Created November 26, 2013 14:07
Show Gist options
  • Save e3krisztian/7658817 to your computer and use it in GitHub Desktop.
Save e3krisztian/7658817 to your computer and use it in GitHub Desktop.
Sort a large number of files into buckets by extracting the bucket name from the file names with a regexp; used to process ~500k files in 4 directories.
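'''
Sort the files found under the current directory into buckets.

Usage:
    sort-to-buckets '/dest/files-{}.lst' 'bucket-selector-python-regexp'

The regexp is searched in every file path; the text captured by its first
group is the bucket name. File paths that do not match are skipped.
The bucket name replaces the '{}' in the first argument, giving one output
file per bucket that lists the bucket's file paths, sorted, one per line:
    /dest/files-1st-bucket.lst
    ...
    /dest/files-last-bucket.lst
'''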
import os
import re
import sys
from collections import defaultdict
# Command line: <output path template> <bucket selector regexp>.
# Validate up front so a mistake is reported immediately instead of as a bare
# unpacking error, or as an IndexError only after the whole tree was walked.
if len(sys.argv) != 3:
    sys.exit(
        'usage: {} OUTPUT_FORMAT BUCKET_RE\n'
        "  e.g. '/dest/files-{{}}.lst' 'bucket-selector-python-regexp'"
        .format(sys.argv[0]))
OUTPUT_FORMAT, BUCKET_RE = sys.argv[1:]
_BUCKET_PATTERN = re.compile(BUCKET_RE)
# The bucket name is read from group 1, so the regexp needs a capture group.
if _BUCKET_PATTERN.groups < 1:
    sys.exit(
        'error: BUCKET_RE must contain a capture group '
        'that selects the bucket name')
# find_bucket(path) returns a match object, or None when path does not match.
find_bucket = _BUCKET_PATTERN.search
def get_filenames(root='.'):
    '''
    Yield the path of every file below `root`, descending into
    subdirectories.

    `root` defaults to the current directory. Each yielded path is the
    walked directory joined with the file name, so it starts with `root`
    (for example './sub/name'). Directories themselves are not yielded.
    '''
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            yield os.path.join(dirpath, filename)
# Sort files into buckets.
# The walk is consumed lazily: with hundreds of thousands of files there is
# no need to hold a separate list of every path next to the buckets.
buckets = defaultdict(list)
for filename in get_filenames():
    m = find_bucket(filename)
    if m:
        # The first capture group of the regexp names the bucket;
        # paths that do not match end up in no bucket at all.
        buckets[m.group(1)].append(filename)

# Write out buckets: one file per bucket, paths sorted, one path per line.
for bucket, bucket_filenames in buckets.items():
    with open(OUTPUT_FORMAT.format(bucket), 'w') as output:
        output.writelines(
            filename + '\n' for filename in sorted(bucket_filenames))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment