Skip to content

Instantly share code, notes, and snippets.

@oversider-kosma
Last active October 21, 2018 15:35
Show Gist options
  • Save oversider-kosma/cc564b9774bcf904a8d5cf697f45e805 to your computer and use it in GitHub Desktop.
Save oversider-kosma/cc564b9774bcf904a8d5cf697f45e805 to your computer and use it in GitHub Desktop.
Counts the appearance of a word in a (probably large) file or stream, reading it by chunks.
#!/usr/bin/python
"""Counts the appearance of a word in a (probably large) file or stream reading it by chunks."""
import os
import sys
from time import time
DEFAULT_CHUNK_SIZE = 2048
REPORT_EVERY_SECS = 1
def _print_usage():
    """Print the command-line usage text to stdout.

    The program name shown on the first line is taken from ``sys.argv[0]``.
    """
    help_lines = (
        'Usage: %s <file> <word> [chunk_size] [-q]\n' % sys.argv[0],
        ' Counts occurrences of <word> in (probably large) <file> reading it by chunks.',
        ' If <file> is "-" then stdin will be used as input.\n',
        ' Options:',
        ' -q\t\tto disable progress report (forced if read goes from stdin)',
    )
    print('\n'.join(help_lines))
def _humanreadable_time(secs):
    """Format a duration given in seconds as ``HH:MM:SS``.

    Each field is zero-padded to at least two digits; the hours field is not
    capped, so durations of 100 hours or more simply use more digits.
    """
    whole_minutes, seconds_part = divmod(secs, 60)
    hours_part, minutes_part = divmod(whole_minutes, 60)
    fields = (hours_part, minutes_part, seconds_part)
    return "%02d:%02d:%02d" % fields
def _report(tell, size, start_time, occurrences):
    """Write a one-line progress report to stdout.

    The line starts with a carriage return so that it overwrites the previous
    report, and is padded with spaces to 80 characters so leftovers of a
    longer previous line are erased.

    Args:
        tell: current position in the input.
        size: total size of the input; when falsy nothing is reported.
        start_time: ``time()`` value taken when processing started.
        occurrences: number of matches found so far.
    """
    if not size:
        return

    elapsed = int(time() - start_time)
    percent = 100. * tell / size

    # Seconds spent per percentage point so far; stays 0 (and the ETA
    # unknown) until both progress and elapsed time are non-zero.
    secs_per_percent = elapsed / percent if percent else 0
    if secs_per_percent:
        eta = _humanreadable_time(int((100 - percent) * secs_per_percent))
    else:
        eta = '???'
    speed = '%0.1f' % (percent / elapsed) if elapsed else '???'

    pieces = [
        '\r%0.1f%% in %s ' % (percent, _humanreadable_time(elapsed)),
        'Found: %d ' % occurrences,
        'Avg speed: %s %%/s ' % speed,
    ]
    # The ETA is only shown while the input is not yet fully consumed.
    if tell != size:
        pieces.append('ETA: %s' % eta)

    line = ''.join(pieces).ljust(80)
    sys.stdout.write(line)
    sys.stdout.flush()
def _main(*args):
    """Command-line entry point: count occurrences of a word in a file or stdin.

    ``args`` mirrors ``sys.argv``: ``args[0]`` is the program name, ``args[1]``
    the input path (``"-"`` selects stdin) and ``args[2]`` the word to count.
    Everything after that is optional: an integer chunk size and/or ``-q`` to
    suppress the progress report, in either order.  ``-h``/``--help`` anywhere
    prints the usage text and exits with status 0; a missing or empty word
    prints it and exits with status 1.

    The final count is printed to stdout on its own line.

    Raises:
        ValueError: if the chunk size argument is not an integer.
        IOError/OSError: if the input file cannot be opened or read.

    NOTE: matches are counted with ``count()`` on each window, which does not
    count overlapping matches; for self-overlapping words such as ``"aa"`` the
    total may differ from a single whole-input count near chunk boundaries.
    """
    wants_help = bool(set(args) & {'--help', '-h'})
    # args[0] is the program name, so <file> and <word> need len(args) >= 3.
    # An empty word is rejected too: it has no meaningful occurrence count.
    if wants_help or len(args) < 3 or not args[2]:
        _print_usage()
        sys.exit(0 if wants_help else 1)

    path, term = args[1], args[2]
    options = args[3:]
    quiet = '-q' in options
    # The chunk size is the first option that is not the -q flag, so
    # "<file> <word> -q" does not try to parse "-q" as a number.
    size_args = [opt for opt in options if opt != '-q']
    chunk_size = int(size_args[0]) if size_args else DEFAULT_CHUNK_SIZE

    # Opened before the try block: if open() fails there is nothing to close
    # and its exception propagates unchanged.
    stream = sys.stdin if path == '-' else open(path, 'rb')
    try:
        from_stdin = stream is sys.stdin
        if not from_stdin and not isinstance(term, bytes):
            # The file is read as bytes; on Python 3 turn the word back into
            # the bytes of the command-line argument so types match.
            # (On Python 2 str is bytes and this branch is never taken.)
            term = os.fsencode(term)

        # Number of trailing units carried over to the next window so that a
        # match straddling two chunks is still seen.
        step_back = len(term) - 1
        if chunk_size <= step_back:
            chunk_size = step_back + DEFAULT_CHUNK_SIZE

        occurrences = 0
        last_time = start = time()

        # Progress is only reported for real files and when not silenced.
        size = None
        if not from_stdin and not quiet:
            size = os.fstat(stream.fileno()).st_size
        if size:
            _report(0, size, start, occurrences)

        empty = term[:0]  # empty str/bytes of the same type as the word
        tail = empty
        chunk = stream.read(chunk_size)
        while chunk:
            data = tail + chunk
            occurrences += data.count(term)
            # A slice [-0:] would be the whole window, so a one-character
            # word (step_back == 0) must carry nothing over.
            tail = data[-step_back:] if step_back else empty
            chunk = stream.read(chunk_size)
            if size and time() - last_time > REPORT_EVERY_SECS:
                # Re-read the size in case the file changed while reading.
                size = os.fstat(stream.fileno()).st_size
                _report(stream.tell(), size, start, occurrences)
                last_time = time()

        if size:
            size = os.fstat(stream.fileno()).st_size
            _report(size, size, start, occurrences)
            sys.stdout.write('\n')
            sys.stdout.flush()
        print("%d" % occurrences)
    finally:
        if stream is not sys.stdin:
            stream.close()
# Script entry point: hand the full argv (program name included) to _main.
if __name__ == "__main__":
    _main(*sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment