# kjhenner/fortytwo.py

## fortytwo.py
import regex as re
import sys
import os
from tqdm import tqdm
from collections import defaultdict

# Matches a number that starts with a non-zero digit, may continue with
# digits and '.'/',' separators, may be followed by sentence punctuation, and
# must be followed by whitespace. Group 1 is the number text (separators and
# any trailing '.'/',' included); callers strip non-digits before counting.
p = re.compile(r"([1-9][0-9\.\,]*)[\.\,\?\!]*\s+")

# The number whose popularity is being checked, in normalized (digits-only) form.
TARGET = "42"

# Multiplier applied to every count that is printed.
# NOTE(review): presumably extrapolates from a sampled corpus to the full
# one -- the origin of 225 is not visible in this file; confirm.
SCALE = 225


def find_text_files(data_path):
    """Return the paths of all ``.txt`` files found under *data_path*.

    The directory tree is walked recursively; only files whose extension is
    exactly ``.txt`` are returned.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(data_path)
        for filename in filenames
        if os.path.splitext(filename)[1] == '.txt'
    ]


def count_numbers(text, counts=None):
    """Tally every number matched by ``p`` in *text*.

    Each match is normalized by removing all non-digit characters, so
    ``"1,000"`` and ``"1000."`` are both counted under ``"1000"``.

    Args:
        text: The string to scan.
        counts: Optional mapping to accumulate into; a fresh
            ``defaultdict(int)`` is created when omitted.

    Returns:
        The mapping of normalized number string to occurrence count.
    """
    if counts is None:
        counts = defaultdict(int)
    for match in p.findall(text):
        counts[re.sub(r'[^\d]', '', match)] += 1
    return counts


def more_frequent_than(counts, target=TARGET):
    """Return ``(number, count)`` pairs strictly more frequent than *target*.

    Pairs are ordered from most to least frequent. A number that merely ties
    with *target* is not reported, and a *target* that never occurs is
    treated as having a count of zero.
    """
    threshold = counts.get(target, 0)
    ranked = sorted(counts.items(), key=lambda item: -item[1])
    return [(number, count) for number, count in ranked if count > threshold]


def report_lines(counts):
    """Build the lines of the final report for the tallied *counts*.

    Returns:
        A list of strings: a single "yes" line when nothing occurs more
        often than 42, a "no" header followed by one scaled line per more
        frequent number otherwise, or a single notice when *counts* is empty.
    """
    if not counts:
        # Nothing was matched at all, so there is no ranking to report.
        return ["no numbers were found, so 42 cannot be ranked"]

    ahead = more_frequent_than(counts)
    if not ahead:
        return ["yes, 42 is the most used number on the internet"]

    lines = ["no, it's not; the following numbers are used more than 42:"]
    lines.extend(
        "{}: {} times".format(number, count * SCALE) for number, count in ahead
    )
    return lines


def main(argv=None):
    """Count numbers in every ``.txt`` file under a directory and report on 42.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]`` must be
            the directory to scan.

    Raises:
        SystemExit: If the directory argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: {} DATA_DIR".format(argv[0] if argv else "fortytwo.py"))

    number_counts = defaultdict(int)
    for path in tqdm(find_text_files(argv[1])):
        # Only ASCII digits and punctuation influence the tally, so replacing
        # undecodable bytes cannot change the result but avoids aborting the
        # whole run on one badly encoded file.
        with open(path, encoding="utf-8", errors="replace") as f:
            count_numbers(f.read(), number_counts)

    for line in report_lines(number_counts):
        print(line)


if __name__ == "__main__":
    main()