Skip to content

Instantly share code, notes, and snippets.

@ecashin
Created July 31, 2014 02:31
Show Gist options
  • Save ecashin/96dc9c3183e6e98db2fd to your computer and use it in GitHub Desktop.
Save ecashin/96dc9c3183e6e98db2fd to your computer and use it in GitHub Desktop.
Process tar files in parallel
#! /usr/bin/env python2.7
from multiprocessing import Pool, Queue
import os
from sys import argv
import tarfile
# Directory names that are removed from the os.walk traversal in main().
BLACKLIST = ['.git']
# File-name suffixes that main() treats as tar archives (matched with endswith).
EXTENSIONS = ['.tgz', '.tar', '.tar.gz', '.tar.bz2']
# Number of worker processes passed to multiprocessing.Pool in main().
N_PARALLEL = 10
def extract_goodness(directory, basename):
    """Print the member names of one tar archive whose name starts with 'bob'.

    A header line ``d(<directory>) b(<basename>)`` is printed first, followed
    by one line per archive member.

    Args:
        directory: Directory that contains the archive.
        basename: File name of the archive inside ``directory``.

    Returns:
        None. All output goes to stdout.

    Raises:
        Whatever ``tarfile.open`` or iteration over the archive raises
        (for example on an unreadable or corrupt archive).
    """
    # NOTE(review): assumes the 'bob' prefix check gates the whole body and
    # not only the header line -- confirm against the intended behaviour.
    if not basename.startswith('bob'):
        return

    # Single-argument print(...) yields identical output on Python 2 and 3.
    print('d({}) b({})'.format(directory, basename))

    archive_path = os.path.join(directory, basename)
    with tarfile.open(archive_path, 'r') as tar:
        for tinfo in tar:
            print(tinfo.name)
def main():
    """Walk each directory named on the command line and process tar archives in parallel.

    Every file whose name ends with one of ``EXTENSIONS`` is handed to
    ``extract_goodness`` in a pool of ``N_PARALLEL`` worker processes.
    Directories named in ``BLACKLIST`` are not descended into.

    Returns silently when no directory argument is given. A failure inside
    a worker is reported on stderr after all tasks have finished, instead of
    being dropped along with the unread async result.
    """
    import sys  # local import: the module itself only imports ``argv``

    if len(argv) < 2:
        return

    # str.endswith accepts a tuple, so each file is tested once and queued at
    # most once even if two suffixes in EXTENSIONS overlap.
    suffixes = tuple(EXTENSIONS)
    pending = []

    pool = Pool(N_PARALLEL)
    for start_dir in argv[1:]:
        for root, dirs, files in os.walk(start_dir):
            # Prune in place: os.walk only skips a directory if it is removed
            # from the very list object it handed out.
            dirs[:] = [name for name in dirs if name not in BLACKLIST]
            for fnam in files:
                if fnam.endswith(suffixes):
                    task = pool.apply_async(extract_goodness, (root, fnam))
                    pending.append((os.path.join(root, fnam), task))
    pool.close()
    pool.join()

    # AsyncResult.get() re-raises whatever the worker raised; without this
    # step those exceptions would never surface.
    for path, task in pending:
        try:
            task.get()
        except Exception as err:
            sys.stderr.write('{}: {}\n'.format(path, err))
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment