Last active
October 21, 2018 15:35
-
-
Save oversider-kosma/cc564b9774bcf904a8d5cf697f45e805 to your computer and use it in GitHub Desktop.
Counts the occurrences of a word in a (probably large) file or stream, reading it in chunks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
"""Counts the appearance of a word in a (probably large) file or stream reading it by chunks.""" | |
import os | |
import sys | |
from time import time | |
# Amount of data requested per read() when the command line gives no chunk
# size; also added to the overlap length when the requested size is too small.
DEFAULT_CHUNK_SIZE = 2048
# Minimum number of seconds that must pass between two progress reports.
REPORT_EVERY_SECS = 1
def _print_usage():
    """Print the command-line help text to stdout.

    The program name shown in the usage line is taken from ``sys.argv[0]``.
    """
    lines = (
        'Usage: %s <file> <word> [chunk_size] [-q]\n' % sys.argv[0],
        ' Counts occurrences of <word> in (probably large) <file> reading it by chunks.',
        ' If <file> is "-" then stdin will be used as input.\n',
        ' Options:',
        ' -q\t\tto disable progress report (forced if read goes from stdin)',
    )
    print('\n'.join(lines))
def _humanreadable_time(secs):
    """Format a duration given in seconds as an ``HH:MM:SS`` string.

    Each field is zero-padded to at least two digits; the hours field is not
    wrapped, so durations of 100 hours or more simply use more digits.
    """
    total_minutes, seconds = divmod(secs, 60)
    whole_hours, minutes = divmod(total_minutes, 60)
    return "%02d:%02d:%02d" % (whole_hours, minutes, seconds)
def _report(tell, size, start_time, occurrences):
    """Write a single-line progress status to stdout.

    The line starts with a carriage return so successive reports overwrite
    each other, and is padded with spaces to 80 characters.

    Parameters:
        tell: current position in the input.
        size: total input size; when falsy nothing is written at all.
        start_time: ``time()`` value taken when processing started.
        occurrences: number of matches found so far.
    """
    if not size:
        return

    elapsed = int(time() - start_time)
    percent = 100. * tell / size

    # Both estimates need a non-zero elapsed time (and, for the ETA, some
    # progress) to extrapolate from; otherwise a placeholder is shown.
    secs_per_percent = elapsed / percent if percent else 0
    if secs_per_percent:
        eta = _humanreadable_time(int((100 - percent) * secs_per_percent))
    else:
        eta = '???'
    speed = '%0.1f' % (percent / elapsed) if elapsed else '???'

    parts = [
        '\r%0.1f%% in %s ' % (percent, _humanreadable_time(elapsed)),
        'Found: %d ' % occurrences,
        'Avg speed: %s %%/s ' % speed,
    ]
    # The ETA is omitted once the whole input has been consumed.
    if tell != size:
        parts.append('ETA: %s' % eta)

    sys.stdout.write(''.join(parts).ljust(80))
    sys.stdout.flush()
def _count_matches(data, term):
    """Count left-to-right, non-overlapping matches of ``term`` in ``data``.

    Returns a ``(count, end)`` pair, where ``end`` is the index just past the
    last match, or 0 when nothing matched.
    """
    count = end = 0
    pos = data.find(term)
    while pos != -1:
        count += 1
        end = pos + len(term)
        pos = data.find(term, end)
    return count, end


def _main(*args):
    """Count occurrences of a word in a file or stdin, reading it by chunks.

    Parameters:
        *args: the command line in ``sys.argv`` layout: program name, then
            ``<file> <word> [chunk_size]``.  The flags ``-q`` (no progress
            report), ``-h`` and ``--help`` may appear at any position.
            ``<file>`` may be ``-`` to read from stdin.

    The input is searched as bytes.  Matches are counted left to right without
    overlapping, and the result does not depend on the chunk size.  The final
    count is printed to stdout on a line of its own.

    Raises:
        SystemExit: with status 0 after printing the usage on ``-h``/``--help``;
            with status 1 when ``<file>`` or ``<word>`` is missing, ``<word>``
            is empty, or ``chunk_size`` is not an integer.
        IOError/OSError: when the input file cannot be opened or read.
    """
    help_requested = bool(set(args) & {'--help', '-h'})
    quiet = '-q' in args
    # Flags may appear anywhere, so separate them from the positional
    # arguments instead of assuming fixed indices (a trailing "-q" used to be
    # parsed as the chunk size).  A lone "-" is the stdin marker, not a flag.
    positional = [arg for arg in args[1:] if arg not in ('-q', '-h', '--help')]
    if help_requested or len(positional) < 2 or not positional[1]:
        _print_usage()
        sys.exit(0 if help_requested else 1)

    source, term = positional[0], positional[1]
    if not isinstance(term, bytes):
        # The input is read as bytes, so the word has to be bytes as well.
        term = os.fsencode(term)
    # Number of trailing bytes that must be carried over to the next chunk so
    # that a match straddling a chunk boundary is still seen.
    step_back = len(term) - 1

    if len(positional) > 2:
        try:
            chunk_size = int(positional[2])
        except ValueError:
            sys.exit('Invalid chunk size: %r' % positional[2])
    else:
        chunk_size = DEFAULT_CHUNK_SIZE
    if chunk_size <= step_back:
        chunk_size = step_back + DEFAULT_CHUNK_SIZE

    from_stdin = source == '-'
    if from_stdin:
        # Prefer the underlying byte stream so stdin and files are both
        # searched as bytes; fall back to sys.stdin itself where there is none.
        stream = getattr(sys.stdin, 'buffer', sys.stdin)
    else:
        # Opened outside the try block: if this fails there is nothing to close.
        stream = open(source, 'rb')

    try:
        occurrences = 0
        last_time = start = time()
        size = None
        if not (from_stdin or quiet):
            size = os.fstat(stream.fileno()).st_size
            _report(0, size, start, occurrences)

        tail = b''
        chunk = stream.read(chunk_size)
        while chunk:
            data = tail + chunk
            matched, consumed = _count_matches(data, term)
            occurrences += matched
            # Keep at most step_back trailing bytes, and never bytes that are
            # already part of a counted match.  For a one-byte word this keeps
            # nothing (the old ``chunk[-0:]`` kept the whole chunk and counted
            # it twice).
            tail = data[max(consumed, len(data) - step_back):]
            chunk = stream.read(chunk_size)
            if size and time() - last_time > REPORT_EVERY_SECS:
                # Re-read the size in case the file is still growing.
                size = os.fstat(stream.fileno()).st_size
                _report(stream.tell(), size, start, occurrences)
                last_time = time()

        if size:
            size = os.fstat(stream.fileno()).st_size
            _report(size, size, start, occurrences)
            # Terminate the progress line before printing the result.
            sys.stdout.write('\n')
            sys.stdout.flush()
        print("%d" % occurrences)
    finally:
        if not from_stdin:
            stream.close()
# Run the command-line entry point only when executed as a script, passing
# the full argument vector (program name included).
if __name__ == '__main__':
    _main(*sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment