Generate a word list from the Wiktionary JSON extraction. Saves the results to a TSV file.
#!/usr/bin/env python3
# Teal Dulcet
# Run: python3 wiktionary.py <input JSON file> <output TSV file>
# export LC_ALL=C.UTF-8
# sudo apt update
# sudo apt install hunspell-tools
# wget https://hg.mozilla.org/mozilla-central/raw-file/tip/extensions/spellcheck/locales/en-US/hunspell/en-US.aff
# wget https://hg.mozilla.org/mozilla-central/raw-file/tip/extensions/spellcheck/locales/en-US/hunspell/en-US.dic
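# # Expand the Hunspell dictionary into a plain word list (one word per line)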
# unmunch en-US.dic en-US.aff > temp.txt
# # Convert the unmunched word list from ISO-8859-1 to UTF-8, producing 'mozilla.txt'
# iconv -f ISO-8859-1 -t UTF-8 temp.txt > mozilla.txt
# wget https://kaikki.org/dictionary/English/kaikki.org-dictionary-English.json
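# # Each line of the kaikki.org file is a standalone JSON object (JSON Lines), e.g.:
# # {"word": "...", "pos": "noun", "lang_code": "en", "lang": "English", "senses": [...], ...}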
# time python3 -X dev wiktionary.py kaikki.org-dictionary-English.json wiktionary.tsv
# Pick one:
# Lowercase letters only: cut -f 1 wiktionary.tsv | grep '^[[:lower:]'\''-]\+$' | grep '[[:lower:]]' > temp.txt
# Alphabetic with one lowercase letter: cut -f 1 wiktionary.tsv | grep '^[[:alpha:]'\''-]\+$' | grep '[[:lower:]]' > temp.txt
# Alphabetic (upper and lowercase): cut -f 1 wiktionary.tsv | grep '^[[:alpha:]'\''-]\+$' | grep '[[:alpha:]]' > temp.txt
# join -t $'\t' <(comm -13 <(sort -u mozilla.txt) <(sort temp.txt)) <(sort -t $'\t' -k 1,1 wiktionary.tsv) | sort -t $'\t' -k 2,2nr > 'Wiktionary words.tsv'
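# # comm -13 keeps the words absent from the Mozilla dictionary; join then attaches the
# # remaining columns from wiktionary.tsv, and the result is sorted by sense count descending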
import csv
import json
import sys
import time
from collections import Counter
from datetime import timedelta
if len(sys.argv) != 3:
    print(f"Usage: {sys.argv[0]} <input JSON file> <output TSV file>", file=sys.stderr)
    sys.exit(1)
# Allowed parts of speech (kaikki.org "pos" values)
parts_of_speech = frozenset(["noun", "verb", "adj", "adv", "prep_phrase", "abbrev", "pron", "prep", "num", "conj", "det", "particle", "postp", "intj"])
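# Number of most common entries to print for the larger counters below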
n = 10
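# The accepted words and their per-word gloss counts, parts of speech, Wikipedia pages and inflected forms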
words = set()
senses = {}
poss = {}
wikis = {}
forms = {}
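# Counters for the summary statistics printed after the run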
keys = Counter()
part = Counter()
langs = Counter()
sources = Counter()
categories = Counter()
parents = Counter()
sense_keys = Counter()
sense_tags = Counter()
form_keys = Counter()
form_tags = Counter()
form_sources = Counter()
start = time.perf_counter()
with open(sys.argv[1], encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        keys.update(data.keys())
        word = data["word"]
        pos = data["pos"]
        asenses = data["senses"]
        part.update([pos])
        langs.update([(data["lang_code"], data["lang"])])
        if "source" in data:
            sources.update([data["source"]])
        if "categories" in data:
            for category in data["categories"]:
                categories.update([category["name"]])
                parents.update(category["parents"])
        if pos not in parts_of_speech:
            continue
        # if "wikipedia" not in data:
        #     continue
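        # Tally sense-level keys and tags for the summary statistics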
        for sense in asenses:
            sense_keys.update(sense.keys())
            if "tags" in sense:
                sense_tags.update(sense["tags"])
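        # Skip the entry if every sense is obsolete, archaic, a misspelling or
        # nonstandard, or is tagged as a non-US regional variant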
if all("tags" in s and (not {"obsolete", "archaic", "misspelling", "nonstandard"}.isdisjoint(s["tags"]) or (
not {"UK", "Britain", "British", "Commonwealth", "England", "Australia", "Australian", "Canada", "Canadian"}.isdisjoint(s["tags"]) and "US" not in s["tags"])) for s in asenses):
# print(f"Skiping: {word}, {pos}")
continue
        if word not in words:
            senses[word] = 0
            poss[word] = set()
            wikis[word] = set()
            forms[word] = set()
            words.add(word)
        senses[word] += sum(1 for s in asenses if "glosses" in s)
        poss[word].add(pos)
        if "wikipedia" in data:
            wikis[word].update(data["wikipedia"])
if "forms" in data:
for aform in data["forms"]:
form_keys.update(aform.keys())
if "tags" in aform:
form_tags.update(aform["tags"])
if "source" in aform:
form_sources.update([aform["source"]])
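                # Strip stray surrounding whitespace from the form, reporting any offenders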
form = aform["form"]
temp = form.strip()
if form != temp:
print(f"Error: {form!r}", aform)
form = temp
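                # Keep real inflected forms; skip inflection-table artifacts,
                # non-US regional spellings and empty or placeholder forms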
if ("tags" not in aform or {"inflection-template", "table-tags", "class", "British",
"Canada", "Australian"}.isdisjoint(aform["tags"])) and form and form != "-":
# if form not in words:
# poss[form] = set()
# wikis[form] = set()
# words.add(form)
# poss[form].add(pos)
# if "wikipedia" in data:
# wikis[form].update(data["wikipedia"])
forms[word].add(form)
end = time.perf_counter()
print(f"Total number of Words: {len(words):n}, Runtime: {timedelta(seconds=end - start)}")
with open(sys.argv[2], "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile, delimiter="\t", lineterminator="\n")
    # writer.writerow(["word", "senses", "form(s)", "part(s) of speech", "Wikipedia page(s)"])
    for word in sorted(words):
        # Every accepted word is a key in wikis, so test the set itself rather than membership
        writer.writerow([word, senses[word], ",".join(sorted(forms[word])) if forms[word] else "-", ",".join(
            sorted(poss[word])), ",".join(map(repr, sorted(wikis[word]))) if wikis[word] else "-"])
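# Summary statistics gathered while streaming the file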
print("\nCounts\n")
print("Keys:", len(keys))
print("\n".join(f"\t{count}\t{key!r}" for key, count in keys.most_common()))
print("Part-of-speech:", len(part))
print("\n".join(f"\t{count}\t{pos}" for pos, count in part.most_common()))
print("Languages:", len(langs))
print("\n".join(f"\t{count}\t{lang}" for lang, count in langs.most_common()))
print("Sources:", len(sources))
print("\n".join(f"\t{count}\t{source}" for source, count in sources.most_common()))
print("Categories (names):", len(categories))
print("\n".join(f"\t{count}\t{category}" for category, count in categories.most_common(n)))
print("Categories (parents):", len(parents))
print("\n".join(f"\t{count}\t{category}" for category, count in parents.most_common(n)))
print("Sense Keys:", len(sense_keys))
print("\n".join(f"\t{count}\t{key!r}" for key, count in sense_keys.most_common()))
print("Sense Tags:", len(sense_tags))
print("\n".join(f"\t{count}\t{tag}" for tag, count in sense_tags.most_common(n * n)))
print("Form Keys:", len(form_keys))
print("\n".join(f"\t{count}\t{key!r}" for key, count in form_keys.most_common()))
print("Form Tags:", len(form_tags))
print("\n".join(f"\t{count}\t{tag}" for tag, count in form_tags.most_common(2 * n)))
print("Form Sources:", len(form_sources))
print("\n".join(f"\t{count}\t{source}" for source, count in form_sources.most_common()))