A Python script that word-counts a very large text file across multiple processes, with a rough progress bar
#!/usr/bin/env python
"""Counts the number of times each word occurs in a very large text file."""
from __future__ import print_function

import argparse
import multiprocessing
import os
import sys
from collections import Counter
from itertools import zip_longest

from tqdm import tqdm


def grouper(n, iterable, padvalue=None):
    # Collect items from `iterable` into fixed-length tuples, padding the
    # final tuple with `padvalue`:
    # grouper(3, 'abcde') -> ('a', 'b', 'c'), ('d', 'e', None)
    return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)


def process_chunk(chunk):
    # `chunk` is a single line of the input file, or a None pad value
    # produced by grouper() for the final batch.
    if chunk:
        # Subtract roughly the fixed overhead of a Python string object so
        # the progress bar tracks bytes of file content rather than objects.
        size = sys.getsizeof(chunk) - 40
        wordcount = Counter(chunk.split())
        return (wordcount, size)
    else:
        # Pad values must still be summable as Counters below.
        return (Counter(), 0)


def main(arguments):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'infile', help="Input file", type=argparse.FileType('r'))
    parser.add_argument(
        'outfile', help="Output file", type=argparse.FileType('w'))
    args = parser.parse_args(arguments)

    p = multiprocessing.Pool(8)
    filesize = os.path.getsize(args.infile.name)
    total_word_counts = Counter()
    with tqdm(total=filesize) as pbar:
        # Feed the pool 1000 lines at a time so the progress bar can be
        # updated between batches.
        for chunk in grouper(1000, args.infile):
            results = p.map(process_chunk, chunk)
            word_counts, sizes = zip(*results)
            pbar.update(sum(sizes))
            total_word_counts += sum(word_counts, Counter())
    p.close()
    p.join()

    for word, count in total_word_counts.items():
        args.outfile.write("{} {}\n".format(word, count))


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
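
The script is run as, e.g., `python wordcount.py big.txt counts.txt` (the script name and both file names here are placeholders). For a quick sanity check of the chunking helper, the following minimal sketch imports the gist as a module, assuming it was saved as wordcount.py:

# "wordcount" is a hypothetical module name for this gist saved as
# wordcount.py; importing it is safe because only the __main__ guard
# triggers execution.
from wordcount import grouper, process_chunk

# grouper pads the final tuple with None so every batch has length n;
# process_chunk therefore maps None pads to an empty Counter.
print(list(grouper(3, "abcde")))    # [('a', 'b', 'c'), ('d', 'e', None)]
print(process_chunk(None))          # (Counter(), 0)
print(process_chunk("a a b\n")[0])  # Counter({'a': 2, 'b': 1})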