Last active
May 9, 2023 10:55
-
-
Save tdulcet/75f80d6a9da049b8378ca8ce9339f77f to your computer and use it in GitHub Desktop.
Generate word list from Wiktionary JSON extraction. Saves results to a TSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Teal Dulcet | |
# Run: python3 wiktionary.py <input JSON file> <output TSV file> | |
# export LC_ALL=C.UTF-8 | |
# sudo apt update | |
# sudo apt install hunspell-tools | |
# wget https://hg.mozilla.org/mozilla-central/raw-file/tip/extensions/spellcheck/locales/en-US/hunspell/en-US.aff | |
# wget https://hg.mozilla.org/mozilla-central/raw-file/tip/extensions/spellcheck/locales/en-US/hunspell/en-US.dic | |
# unmunch en-US.dic en-US.aff > temp.txt | |
# # Convert 'temp.txt' from ISO-8859-1 to UTF-8 and save the result as 'mozilla.txt'
# iconv -f ISO-8859-1 -t UTF-8 temp.txt > mozilla.txt | |
# wget https://kaikki.org/dictionary/English/kaikki.org-dictionary-English.json | |
# time python3 -X dev wiktionary.py kaikki.org-dictionary-English.json wiktionary.tsv | |
# Pick one: | |
# Lowercase letters only: cut -f 1 wiktionary.tsv | grep '^[[:lower:]'\''-]\+$' | grep '[[:lower:]]' > temp.txt | |
# Alphabetic with one lowercase letter: cut -f 1 wiktionary.tsv | grep '^[[:alpha:]'\''-]\+$' | grep '[[:lower:]]' > temp.txt | |
# Alphabetic (upper and lowercase): cut -f 1 wiktionary.tsv | grep '^[[:alpha:]'\''-]\+$' | grep '[[:alpha:]]' > temp.txt | |
# join -t $'\t' <(comm -13 <(sort -u mozilla.txt) <(sort temp.txt)) <(sort -t $'\t' -k 1,1 wiktionary.tsv) | sort -t $'\t' -k 2,2nr > 'Wiktionary words.tsv' | |
import csv | |
import json | |
import sys | |
import time | |
from collections import Counter | |
from datetime import timedelta | |
# The script takes exactly two arguments: the input JSON file and the output
# TSV file. sys.exit() with a string writes it to stderr and exits with
# status 1.
expected_argc = 3
if len(sys.argv) != expected_argc:
    sys.exit(f"Usage: {sys.argv[0]} <input JSON file> <output TSV file>")
# Parts of speech whose entries are turned into output rows; entries with any
# other "pos" value are only tallied in the statistics.
parts_of_speech = frozenset({
    "noun",
    "verb",
    "adj",
    "adv",
    "prep_phrase",
    "abbrev",
    "pron",
    "prep",
    "num",
    "conj",
    "det",
    "particle",
    "postp",
    "intj",
})

# Base size of the truncated "most common" listings printed at the end.
n = 10

# Accepted words, plus per-word data keyed by word: number of senses with
# glosses, parts of speech, Wikipedia pages and recorded forms.
words = set()
senses, poss, wikis, forms = {}, {}, {}, {}

# Statistics over the input entries: top-level keys, parts of speech,
# (language code, language) pairs and sources.
keys, part, langs, sources = Counter(), Counter(), Counter(), Counter()
# Category names and their parent categories.
categories, parents = Counter(), Counter()
# Keys and tags seen on senses.
sense_keys, sense_tags = Counter(), Counter()
# Keys, tags and sources seen on forms.
form_keys, form_tags, form_sources = Counter(), Counter(), Counter()
# Tag sets used by the filters below. They are built once here rather than
# being re-created for every sense and form of every input line.

# Sense tags marking a sense as not part of current standard usage.
_NONSTANDARD_SENSE_TAGS = frozenset({"obsolete", "archaic", "misspelling", "nonstandard"})
# Regional sense tags; such a sense is excluded unless it is also tagged "US".
_NON_US_SENSE_TAGS = frozenset({
    "UK", "Britain", "British", "Commonwealth", "England",
    "Australia", "Australian", "Canada", "Canadian",
})
# Form tags whose forms are not recorded.
_SKIPPED_FORM_TAGS = frozenset({
    "inflection-template", "table-tags", "class",
    "British", "Canada", "Australian",
})


def _is_excluded_sense(sense):
    """Return True if *sense* has tags that exclude it from the word list.

    A sense is excluded when it carries one of the nonstandard tags, or one of
    the regional tags without also carrying "US". A sense without a "tags" key
    is never excluded.
    """
    if "tags" not in sense:
        return False
    tags = sense["tags"]
    if not _NONSTANDARD_SENSE_TAGS.isdisjoint(tags):
        return True
    return not _NON_US_SENSE_TAGS.isdisjoint(tags) and "US" not in tags


start = time.perf_counter()

# The input holds one JSON object per line.
with open(sys.argv[1], encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of letting
        # json.loads() fail on them.
        if not line.strip():
            continue

        data = json.loads(line)

        keys.update(data.keys())

        word = data["word"]
        pos = data["pos"]
        asenses = data["senses"]

        # Statistics are gathered for every entry, before any filtering.
        part[pos] += 1
        langs[(data["lang_code"], data["lang"])] += 1
        if "source" in data:
            sources[data["source"]] += 1
        if "categories" in data:
            for category in data["categories"]:
                categories[category["name"]] += 1
                parents.update(category["parents"])

        if pos not in parts_of_speech:
            continue

        # Optional filter: keep only entries that link to Wikipedia.
        # if "wikipedia" not in data:
        #     continue

        for sense in asenses:
            sense_keys.update(sense.keys())
            if "tags" in sense:
                sense_tags.update(sense["tags"])

        # Skip the entry when every one of its senses is excluded. all() is
        # True for an empty list, so entries without senses are skipped too.
        if all(_is_excluded_sense(s) for s in asenses):
            # print(f"Skipping: {word}, {pos}")
            continue

        # First accepted entry for this word: create its accumulators.
        if word not in words:
            senses[word] = 0
            poss[word] = set()
            wikis[word] = set()
            forms[word] = set()
            words.add(word)

        # Only senses that have glosses count towards the word's sense total.
        senses[word] += sum(1 for s in asenses if "glosses" in s)
        poss[word].add(pos)
        if "wikipedia" in data:
            wikis[word].update(data["wikipedia"])

        if "forms" in data:
            for aform in data["forms"]:
                form_keys.update(aform.keys())
                if "tags" in aform:
                    form_tags.update(aform["tags"])
                if "source" in aform:
                    form_sources[aform["source"]] += 1

                form = aform["form"]
                stripped = form.strip()
                if form != stripped:
                    # Report forms with stray surrounding whitespace, then
                    # continue with the cleaned value.
                    print(f"Error: {form!r}", aform)
                    form = stripped

                if "tags" in aform and not _SKIPPED_FORM_TAGS.isdisjoint(aform["tags"]):
                    continue
                # Ignore empty forms and the "-" placeholder.
                if not form or form == "-":
                    continue

                # Alternative (disabled): treat each form as a word of its own.
                # if form not in words:
                #     poss[form] = set()
                #     wikis[form] = set()
                #     words.add(form)
                # poss[form].add(pos)
                # if "wikipedia" in data:
                #     wikis[form].update(data["wikipedia"])
                forms[word].add(form)

end = time.perf_counter()
print(f"Total number of Words: {len(words):n}, Runtime: {timedelta(seconds=end - start)}")
# Write one tab-separated row per accepted word, in sorted order. Columns:
# word, number of senses, forms, parts of speech, Wikipedia pages.
with open(sys.argv[2], "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile, delimiter="\t", lineterminator="\n")
    # writer.writerow(["word", "senses", "form(s)", "part(s) of speech", "Wikipedia page(s)"])
    for word in sorted(words):
        word_forms = forms[word]
        word_wikis = wikis[word]
        writer.writerow([
            word,
            senses[word],
            ",".join(sorted(word_forms)) if word_forms else "-",
            ",".join(sorted(poss[word])),
            # Every word has a (possibly empty) set in wikis, so test the set
            # itself; a membership test on the dict is always true and would
            # never produce the "-" placeholder.
            ",".join(map(repr, sorted(word_wikis))) if word_wikis else "-",
        ])
print("\nCounts\n")

# One row per statistic: heading, counter, how many of the most common items
# to list (None lists all of them), and whether items are shown via repr().
reports = (
    ("Keys:", keys, None, True),
    ("Part-of-speech:", part, None, False),
    ("Languages:", langs, None, False),
    ("Sources:", sources, None, False),
    ("Categories (names):", categories, n, False),
    ("Categories (parents):", parents, n, False),
    ("Sense Keys:", sense_keys, None, True),
    ("Sense Tags:", sense_tags, n * n, False),
    ("Form Keys:", form_keys, None, True),
    ("Form Tags:", form_tags, 2 * n, False),
    ("Form Sources:", form_sources, None, False),
)

for heading, counter, limit, quoted in reports:
    # The heading line shows the number of distinct items in the counter.
    print(heading, len(counter))
    rows = []
    for item, count in counter.most_common(limit):
        shown = repr(item) if quoted else item
        rows.append(f"\t{count}\t{shown}")
    # An empty counter still prints one empty line.
    print("\n".join(rows))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment