A Python script that word-counts a very large text file across multiple processes, with a rough progress bar
#!/usr/bin/env python
"""Counts the number of times each word occurs in a very large text file."""
from __future__ import print_function

import argparse
import multiprocessing
import os
import sys
from collections import Counter
from itertools import zip_longest

from tqdm import tqdm


def grouper(n, iterable, padvalue=None):
    # Collect items from `iterable` into fixed-length tuples, padding the
    # final tuple with `padvalue`:
    # grouper(3, 'abcde') -> ('a', 'b', 'c'), ('d', 'e', None)
    return zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)


def process_chunk(chunk):
    # `chunk` is a single line of the input file, or a None pad value
    # produced by grouper() for the final batch.
    if chunk:
        # Subtract roughly the fixed overhead of a Python string object so
        # the progress bar tracks bytes of file content rather than objects.
        size = sys.getsizeof(chunk) - 40
        wordcount = Counter(chunk.split())
        return (wordcount, size)
    else:
        # Pad values must still be summable as Counters below.
        return (Counter(), 0)


def main(arguments):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'infile', help="Input file", type=argparse.FileType('r'))
    parser.add_argument(
        'outfile', help="Output file", type=argparse.FileType('w'))
    args = parser.parse_args(arguments)

    p = multiprocessing.Pool(8)
    filesize = os.path.getsize(args.infile.name)
    total_word_counts = Counter()
    with tqdm(total=filesize) as pbar:
        # Feed the pool 1000 lines at a time so the progress bar can be
        # updated between batches.
        for chunk in grouper(1000, args.infile):
            results = p.map(process_chunk, chunk)
            word_counts, sizes = zip(*results)
            pbar.update(sum(sizes))
            total_word_counts += sum(word_counts, Counter())
    p.close()
    p.join()

    for word, count in total_word_counts.items():
        args.outfile.write("{} {}\n".format(word, count))


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
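
The script is run as, e.g., `python wordcount.py big.txt counts.txt` (the script name and both file names here are placeholders). For a quick sanity check of the chunking helper, the following minimal sketch imports the gist as a module, assuming it was saved as wordcount.py:

# "wordcount" is a hypothetical module name for this gist saved as
# wordcount.py; importing it is safe because only the __main__ guard
# triggers execution.
from wordcount import grouper, process_chunk

# grouper pads the final tuple with None so every batch has length n;
# process_chunk therefore maps None pads to an empty Counter.
print(list(grouper(3, "abcde")))    # [('a', 'b', 'c'), ('d', 'e', None)]
print(process_chunk(None))          # (Counter(), 0)
print(process_chunk("a a b\n")[0])  # Counter({'a': 2, 'b': 1})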