Skip to content

Instantly share code, notes, and snippets.

@rmmh
Created July 16, 2019 09:05
Show Gist options
  • Save rmmh/6bcc6de6153f0467b1bec686a9ad832d to your computer and use it in GitHub Desktop.
Save rmmh/6bcc6de6153f0467b1bec686a9ad832d to your computer and use it in GitHub Desktop.
Project Gutenberg Initialisms
#!/usr/bin/env python3
# an experiment in using a large corpus of phrases to generate initialisms from
import re
import secrets
import zipfile

from nltk import tokenize
# NOTE: `phrases` is never populated by this script; it is kept because the
# per-file progress line prints its size.
phrases = {}
# Maps an initialism (e.g. "hwiat") to a list of (phrase, source filename)
# tuples for every sentence that produced it.
shorts = {}

# Only initialisms whose length falls in this inclusive range are recorded.
MIN_INITIALS = 4
MAX_INITIALS = 12


def initialism(sentence):
    """Return ``(phrase, short)`` for one sentence.

    ``phrase`` is the sentence lower-cased, with every character other than
    ``a-z``, ``-`` and whitespace removed, stripped, and with each run of
    whitespace collapsed to a single space.  ``short`` is the first character
    of every whitespace-separated word of ``phrase`` joined together.
    """
    letters_only = re.sub(r'[^-a-z \s]', '', sentence.lower()).strip()
    phrase = re.sub(r'[\s]+', ' ', letters_only)
    short = ''.join(word[0] for word in phrase.split())
    return phrase, short


def index_corpus(path="Gutenberg.zip"):
    """Fill the module-level ``shorts`` table from every member of the zip.

    Members are visited in filename order.  A progress line (filename and the
    current sizes of ``shorts`` and ``phrases``) is printed before each member
    is processed.  Each member is decoded as UTF-8 and split into sentences
    with ``nltk.tokenize.sent_tokenize``.
    """
    with zipfile.ZipFile(path) as z:
        for f in sorted(z.filelist, key=lambda x: x.filename):
            print(f.filename, len(shorts), len(phrases))
            for s in tokenize.sent_tokenize(z.read(f).decode('utf8')):
                phrase, short = initialism(s)
                if MIN_INITIALS <= len(short) <= MAX_INITIALS:
                    shorts.setdefault(short, []).append((phrase, f.filename))


def sample_initialisms(table, count=3):
    """Return ``count`` random ``(short, entries)`` pairs drawn from ``table``.

    Picks are made with replacement, so the same initialism may appear more
    than once.  An empty ``table`` yields an empty list.
    """
    # secrets.choice indexes its argument by integer position, so it needs a
    # sequence; handing it the dict itself raises KeyError.
    keys = list(table)
    if not keys:
        return []
    picks = (secrets.choice(keys) for _ in range(count))
    return [(key, table[key]) for key in picks]


def main():
    """Index the corpus, then print three randomly chosen initialisms."""
    index_corpus()
    for s, entries in sample_initialisms(shorts, 3):
        print(s, entries)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment