Skip to content

Instantly share code, notes, and snippets.

@dimaqq
Created January 11, 2021 03:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dimaqq/0d0728f3eb1a55b1d7653117e55926f4 to your computer and use it in GitHub Desktop.
Save dimaqq/0d0728f3eb1a55b1d7653117e55926f4 to your computer and use it in GitHub Desktop.
from collections import Counter
from pathlib import Path
def ngrams(n, s):
    """Yield each contiguous window of exactly ``n`` items from ``s``.

    Windows are produced left to right, one per start position, by slicing
    ``s[i:i + n]``. Only full-length windows are yielded, so the shorter
    slices that would otherwise appear at the tail of ``s`` never reach the
    caller and cannot distort frequency counts.

    Args:
        n: Window size; must be at least 1.
        s: Any sliceable sequence with a length (typically a string).

    Yields:
        Slices of ``s`` of length ``n``. Nothing is yielded when ``s`` is
        shorter than ``n``.

    Raises:
        ValueError: If ``n`` is less than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    # The last valid start index is len(s) - n; range() is simply empty when
    # that is negative, which covers inputs shorter than one window.
    for start in range(len(s) - n + 1):
        yield s[start:start + n]
if __name__ == "__main__":
    # Command line: [-N] FILE [FILE ...]
    # Prints the 20 most frequent N-grams (default N=2) of the lower-cased
    # contents of the given files, each with its relative frequency.
    import sys

    args = sys.argv[1:]
    gram = 2  # default to bigrams when no -N option is given
    if args and args[0].startswith("-"):
        option, *args = args
        try:
            # Everything after the dash is the size, so multi-digit values
            # such as -10 are accepted.
            gram = int(option[1:])
        except ValueError:
            sys.exit(f"invalid n-gram size: {option!r}")
        if gram < 1:
            sys.exit(f"n-gram size must be at least 1: {option!r}")
    if not args:
        sys.exit(f"usage: {sys.argv[0]} [-N] FILE [FILE ...]")

    # Files are read with Path.read_text()'s default encoding and joined with
    # a newline, so grams may span the boundary between two files.
    data = "\n".join(Path(f).read_text().lower() for f in args)
    counts = Counter(ngrams(gram, data))
    total = sum(counts.values())
    # Highest count first; ties are broken by the gram itself, descending.
    ranked = sorted(counts.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    for text, count in ranked[:20]:
        print(f"{count / total:.3f}: {text!r}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment