Skip to content

Instantly share code, notes, and snippets.

@kjhenner
Last active March 9, 2020 08:49
Show Gist options
  • Save kjhenner/10de495df70347f5c2bdcc016dc89665 to your computer and use it in GitHub Desktop.
Save kjhenner/10de495df70347f5c2bdcc016dc89665 to your computer and use it in GitHub Desktop.
import regex as re
import sys
import os
from tqdm import tqdm
from collections import defaultdict
# Matches a run that starts with a non-zero digit and continues with digits,
# dots or commas, followed by optional sentence punctuation and mandatory
# whitespace.  Only the numeric run (group 1) is captured.
p = re.compile(r"([1-9][0-9\.\,]*)[\.\,\?\!]*\s+")

# Factor every printed count is multiplied by.  The value is kept from the
# original script; presumably it scales a sample up to the full corpus
# -- TODO confirm.
COUNT_SCALE = 225

# The number whose popularity the script reports on.
TARGET = "42"


def find_text_files(root):
    """Return the paths of all ``.txt`` files found by walking *root*."""
    return [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root)
        for filename in filenames
        if os.path.splitext(filename)[1] == '.txt'
    ]


def count_numbers(text, counts=None):
    """Tally every number matched by ``p`` in *text*.

    Each match is normalised by removing all non-digit characters, so
    ``"1,000"`` and ``"42."`` are counted as ``"1000"`` and ``"42"``.
    NOTE(review): the same normalisation turns ``"4.2"`` into ``"42"``;
    confirm whether decimals should be counted that way.

    Args:
        text: The string to scan.
        counts: Optional mapping to accumulate into; a new
            ``defaultdict(int)`` is created when omitted.

    Returns:
        The mapping of normalised number string -> occurrence count.
    """
    if counts is None:
        counts = defaultdict(int)
    for match in p.findall(text):
        counts[re.sub(r'[^\d]', '', match)] += 1
    return counts


def rank_counts(counts):
    """Return ``(number, count)`` pairs sorted by descending count."""
    return sorted(counts.items(), key=lambda item: -item[1])


def build_report(ranked):
    """Build the output lines for a ranked list of ``(number, count)`` pairs.

    Only numbers that occur strictly more often than ``TARGET`` are listed,
    so a number that merely ties with it is not reported as "used more".

    Returns:
        A list of lines ready to be printed.
    """
    target_count = dict(ranked).get(TARGET, 0)
    ahead = [(number, count) for number, count in ranked
             if count > target_count]
    if target_count and not ahead:
        return ["yes, 42 is the most used number on the internet"]
    lines = ["no, it's not; the following numbers are used more than 42:"]
    lines.extend("{}: {} times".format(number, count * COUNT_SCALE)
                 for number, count in ahead)
    return lines


def main(argv):
    """Count numbers in every ``.txt`` file under ``argv[1]`` and report.

    Exits with a message when the directory argument is missing or when no
    number was found at all, instead of failing with an IndexError.
    """
    if len(argv) < 2:
        sys.exit("usage: {} DATA_DIR".format(argv[0] if argv else "script"))
    data_path = argv[1]
    number_counts = defaultdict(int)
    for path in tqdm(find_text_files(data_path)):
        # Explicit encoding with errors="replace": one badly encoded file
        # should not abort the run, and replacement characters never match
        # the digit pattern.
        with open(path, encoding="utf-8", errors="replace") as f:
            count_numbers(f.read(), number_counts)
    if not number_counts:
        sys.exit("no numbers found in .txt files under {}".format(data_path))
    for line in build_report(rank_counts(number_counts)):
        print(line)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment