Last active
May 9, 2023 10:59
-
-
Save tdulcet/b54041bbe532341617099bf1d26af093 to your computer and use it in GitHub Desktop.
Generate word list from Google Ngram data for 1-grams. Saves results to a TSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Teal Dulcet | |
# Run: python3 google.py <input TSV file(s)>... <output TSV file> | |
# export LC_ALL=C.UTF-8 | |
# sudo apt update | |
# sudo apt install hunspell-tools | |
# wget https://hg.mozilla.org/mozilla-central/raw-file/tip/extensions/spellcheck/locales/en-US/hunspell/en-US.aff | |
# wget https://hg.mozilla.org/mozilla-central/raw-file/tip/extensions/spellcheck/locales/en-US/hunspell/en-US.dic | |
# unmunch en-US.dic en-US.aff > temp.txt | |
# # Convert 'mozilla.txt' to UTF-8 | |
# iconv -f ISO-8859-1 -t UTF-8 temp.txt > mozilla.txt | |
# wget https://storage.googleapis.com/books/ngrams/books/20200217/eng-us/1-000{00..13}-of-00014.gz | |
# gzip -dv 1-000{00..13}-of-00014.gz | |
# time python3 -X dev google.py 1-000{00..13}-of-00014 google.tsv | |
# cut -f 1 google.tsv | grep '^[[:alpha:]'\''-]\+$' | grep '[[:alpha:]]' > temp.txt | |
# join -t $'\t' -i -2 2 <(comm -13 <(tr '[:upper:]' '[:lower:]' < mozilla.txt | sort -u) <(tr '[:upper:]' '[:lower:]' < temp.txt | sort)) <(nl -d '' -w 1 google.tsv | sort -t $'\t' -f -k 2,2) | sort -t $'\t' -k 2,2n > 'Google Ngram.tsv' | |
# head -n 100000 'Google Ngram.tsv' > 'Google Ngram words.tsv' | |
import csv | |
import sys | |
import time | |
from collections import Counter | |
from datetime import timedelta | |
# Need at least one input TSV file plus the output TSV file.
if len(sys.argv) < 3:
	usage = f"Usage: {sys.argv[0]} <input TSV file(s)>... <output TSV file>"
	print(usage, file=sys.stderr)
	sys.exit(1)
# Part-of-speech tags accepted from the ngram data (Google's universal tag set).
parts_of_speech = frozenset({
	"NOUN", "VERB", "ADJ", "ADV", "PRON", "DET", "ADP", "NUM", "CONJ", "PRT", ".", "X"})
# Counter field used to rank words when writing the output.
key = "match"
# words = set()
forms = {}  # lowercase word -> {original-case form -> Counter(match=..., volume=...)}
poss = {}  # lowercase word -> set of part-of-speech tags seen for it
part = Counter()  # overall tally of part-of-speech tags
words = 0  # running count of distinct lowercase words (updated per file)
current = start = time.perf_counter()
files = sys.argv[1:-1]  # every argument except the last is an input file
# Stream every input Google Ngram TSV file, accumulating per-word statistics
# into forms/poss/part and printing per-file progress and timing.
for i, file in enumerate(files):
	# Progress: file index, total, and percent complete *before* this file.
	print(f"{i + 1:n}/{len(files):n} ({i / len(files):%}):\t{file!r}")
	with open(file, newline="", encoding="utf-8") as csvfile:
		# QUOTE_NONE: the ngram dumps are raw tab-separated text with no quoting.
		reader = csv.reader(csvfile, delimiter="\t", quoting=csv.QUOTE_NONE)
		for row in reader:
			ngram = row[0]
			# Only tagged 1-grams of the form "word_POS" are counted; untagged
			# rows are skipped entirely.
			if "_" in ngram:
				# rpartition: split on the LAST underscore, so words containing
				# underscores keep everything before the tag.
				word, _, pos = ngram.rpartition("_")
				if word and pos in parts_of_speech:
					aword = word.lower()
					part.update([pos])
					if aword not in forms:
						# First time this lowercase word is seen: create its
						# per-form counter map and its tag set together.
						forms[aword] = {}
						poss[aword] = set()
						# words.add(aword)
					poss[aword].add(pos)
					if word not in forms[aword]:
						forms[aword][word] = Counter()
					acount = forms[aword][word]
					# Remaining columns are "year,match_count,volume_count"
					# triples; sum the counts across all years.
					for data in row[1:]:
						year, match, volume = data.split(",")
						acount.update(match=int(match), volume=int(volume))
	end = time.perf_counter()
	awords = len(forms)
	# Words added by this file = new total minus the previous total.
	print(f"Number of Words: {awords - words:n}, Time: {timedelta(seconds=end - current)}")
	current = end
	words = awords
print(f"\nTotal number of Words: {words:n}, Runtime: {timedelta(seconds=end - start)}")
# Write the aggregated word list to the output TSV, most frequent words first.
with open(sys.argv[-1], "w", newline="", encoding="utf-8") as outfile:
	out = csv.writer(outfile, delimiter="\t", lineterminator="\n", quotechar="", quoting=csv.QUOTE_NONE)
	# out.writerow(["word", "form(s)", "part(s) of speech", "match count", "volume count"])
	# Total counts per lowercase word, summed across all original-case forms.
	totals = ((entry, sum(case_counts.values(), Counter())) for entry, case_counts in forms.items())
	for lower_word, total in sorted(totals, key=lambda item: item[1][key], reverse=True):
		case_map = forms[lower_word]
		# Original-case forms ranked by their match count, most common first.
		ranked = Counter({form: counts[key] for form, counts in case_map.items()}).most_common()
		# Prefer the all-lowercase spelling when it actually occurred;
		# otherwise fall back to the most common form.
		headword = lower_word if lower_word in case_map else ranked[0][0]
		form_list = ",".join(dict(ranked)) if len(ranked) > 1 else "-"
		out.writerow([headword, form_list, ",".join(sorted(poss[lower_word])), total["match"], total["volume"]])
print("\nCounts\n")
print("Part-of-speech:", len(part))
print("\n".join(f"\t{num}\t{pos}" for pos, num in part.most_common()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment