Skip to content

Instantly share code, notes, and snippets.

@tdulcet
Last active May 9, 2023 10:59
Show Gist options
  • Save tdulcet/b54041bbe532341617099bf1d26af093 to your computer and use it in GitHub Desktop.
Generate word list from Google Ngram data for 1-grams. Saves results to a TSV file.
#!/usr/bin/env python3
# Teal Dulcet
# Run: python3 google.py <input TSV file(s)>... <output TSV file>
# export LC_ALL=C.UTF-8
# sudo apt update
# sudo apt install hunspell-tools
# wget https://hg.mozilla.org/mozilla-central/raw-file/tip/extensions/spellcheck/locales/en-US/hunspell/en-US.aff
# wget https://hg.mozilla.org/mozilla-central/raw-file/tip/extensions/spellcheck/locales/en-US/hunspell/en-US.dic
# unmunch en-US.dic en-US.aff > temp.txt
# # Convert 'mozilla.txt' to UTF-8
# iconv -f ISO-8859-1 -t UTF-8 temp.txt > mozilla.txt
# wget https://storage.googleapis.com/books/ngrams/books/20200217/eng-us/1-000{00..13}-of-00014.gz
# gzip -dv 1-000{00..13}-of-00014.gz
# time python3 -X dev google.py 1-000{00..13}-of-00014 google.tsv
# cut -f 1 google.tsv | grep '^[[:alpha:]'\''-]\+$' | grep '[[:alpha:]]' > temp.txt
# join -t $'\t' -i -2 2 <(comm -13 <(tr '[:upper:]' '[:lower:]' < mozilla.txt | sort -u) <(tr '[:upper:]' '[:lower:]' < temp.txt | sort)) <(nl -d '' -w 1 google.tsv | sort -t $'\t' -f -k 2,2) | sort -t $'\t' -k 2,2n > 'Google Ngram.tsv'
# head -n 100000 'Google Ngram.tsv' > 'Google Ngram words.tsv'
import csv
import sys
import time
from collections import Counter
from datetime import timedelta
# Need at least one input TSV plus the output TSV on the command line.
if len(sys.argv) < 3:
    # sys.exit() with a string prints it to stderr and exits with status 1,
    # exactly like the explicit print(file=sys.stderr) + sys.exit(1) pair.
    sys.exit(f"Usage: {sys.argv[0]} <input TSV file(s)>... <output TSV file>")
# Google Books Ngram part-of-speech tags that we keep.
parts_of_speech = frozenset((
    "NOUN", "VERB", "ADJ", "ADV", "PRON", "DET",
    "ADP", "NUM", "CONJ", "PRT", ".", "X",
))
key = "match"  # which counter ("match" or "volume") words are ranked by
forms = {}     # lowercased word -> {surface form -> Counter(match=…, volume=…)}
poss = {}      # lowercased word -> set of POS tags seen for it
part = Counter()  # overall tally of POS tags across all rows kept
words = 0         # distinct lowercased words seen so far (progress reporting)
current = start = time.perf_counter()
files = sys.argv[1:-1]  # every argument but the last is an input file
# Ingest every input TSV of Google Ngram 1-gram data.  Each row looks like
#   "<word>_<POS>\t<year>,<match_count>,<volume_count>\t..."
# Accumulate match/volume totals per case-folded word, per surface form,
# and tally the POS tags seen.  Mutates: forms, poss, part, words, current.
for i, file in enumerate(files):
    print(f"{i + 1:n}/{len(files):n} ({i / len(files):%}):\t{file!r}")
    with open(file, newline="", encoding="utf-8") as csvfile:
        # QUOTE_NONE: the Ngram dumps are plain tab-separated, never quoted.
        reader = csv.reader(csvfile, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            ngram = row[0]
            # Only tagged 1-grams ("word_POS") are of interest; rows without
            # a tag, with an empty word, or with an unknown tag are skipped.
            if "_" not in ngram:
                continue
            word, _, pos = ngram.rpartition("_")
            if not word or pos not in parts_of_speech:
                continue
            aword = word.lower()
            part[pos] += 1  # single-key tally; clearer than part.update([pos])
            if aword not in forms:
                forms[aword] = {}
                poss[aword] = set()
            poss[aword].add(pos)
            # Hoist the per-word dict to avoid repeated forms[aword] lookups.
            aforms = forms[aword]
            if word not in aforms:
                aforms[word] = Counter()
            acount = aforms[word]
            # Remaining fields are "year,match_count,volume_count" triples;
            # the year itself is not needed, only the running totals.
            for data in row[1:]:
                _year, match, volume = data.split(",")
                acount.update(match=int(match), volume=int(volume))
    end = time.perf_counter()
    awords = len(forms)
    print(f"Number of Words: {awords - words:n}, Time: {timedelta(seconds=end - current)}")
    current = end
    words = awords
print(f"\nTotal number of Words: {words:n}, Runtime: {timedelta(seconds=end - start)}")
# Write the aggregated word list, ordered by descending match count.
with open(sys.argv[-1], "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile, delimiter="\t", lineterminator="\n", quotechar="", quoting=csv.QUOTE_NONE)
    # Header row (kept disabled, as in the original):
    # writer.writerow(["word", "form(s)", "part(s) of speech", "match count", "volume count"])
    # Per lowercased word, sum the counters of all its surface forms.
    totals = ((lword, sum(value.values(), Counter())) for lword, value in forms.items())
    for lword, total in sorted(totals, key=lambda item: item[1][key], reverse=True):
        variants = forms[lword]
        # Surface forms ranked by their own match counts, most common first.
        ranked = Counter({form: counts[key] for form, counts in variants.items()}).most_common()
        row = [
            # Prefer the all-lowercase spelling if it occurred; otherwise the
            # most common surface form.
            lword if lword in variants else ranked[0][0],
            ",".join(dict(ranked)) if len(ranked) > 1 else "-",
            ",".join(sorted(poss[lword])),
            total["match"],
            total["volume"],
        ]
        writer.writerow(row)
print("\nCounts\n")
print("Part-of-speech:", len(part))
print("\n".join(f"\t{count}\t{pos}" for pos, count in part.most_common()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment