"""
Extracts word transcriptions from En-En_Oxford Advanced Learners Dictionary.dsl
These words have invalid definitions:
shit, to
"""
import re
IGNORE_TEXT_IN_PARENTHESIS = True
DICTIONARY_DSL = "En-En_Oxford Advanced Learners Dictionary.dsl"
IN_ENCODING = "utf-16"
OUT_ENCODING = "utf-8"
RE_PSPEECH = re.compile(r"\[c orange\](?:[ ]*)(.*?)(?:[, ]*)\[/c\]")
RE_TRAN = re.compile(r"\[c darkcyan\](?:\\\[)(.*?)(?:\\\])\[/c\]")
RE_P_BRE = re.compile(r"\[p\]BrE\[/p\]")
RE_P_AME = re.compile(r"\[p\]N?AmE\[/p\]")
RE_PARENTHESIS = re.compile(r"\([^()]*\)")
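# Illustrative (not verbatim) body line that the patterns above are aimed at;
# the real markup in the dictionary may differ in detail:
#   [m1][c red]example[/c] [c orange]noun[/c] [p]BrE[/p] [c darkcyan]\[ɪɡˈzɑːmpl\][/c] [p]NAmE[/p] [c darkcyan]\[ɪɡˈzæmpl\][/c]
# RE_PSPEECH captures the part of speech, RE_TRAN the escaped-bracket
# transcription, and RE_P_BRE / RE_P_AME mark which variant a transcription
# belongs to.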
def read_all_lines(filename, encoding):
    with open(filename, "r", encoding=encoding) as f:
        for line in f:
            yield line.rstrip()
def extract_trs(s):
    """Collects BrE and NAmE transcriptions from a body line, using the
    [p]BrE[/p] / [p]NAmE[/p] markers to decide which list each one goes to."""
    trs_bre = []
    trs_ame = []
    for part in RE_P_BRE.split(s):
        subpart = RE_P_AME.split(part, 1)
        trs_bre.extend(RE_TRAN.findall(subpart[0]))
        if len(subpart) > 1:
            trs_ame.extend(RE_TRAN.findall(subpart[1]))
    return trs_bre, trs_ame
class DefEntry:
    def __init__(self, pspeech, trs_bre, trs_ame):
        self.pspeech = pspeech
        self.trs_bre = trs_bre
        self.trs_ame = trs_ame


class WordDef:
    def __init__(self):
        self.words = []
        self.entries = []
def remove_text_parenths(s):
    # Substitute repeatedly so that nested parentheses are removed as well.
    while True:
        tmps = RE_PARENTHESIS.sub("", s)
        if tmps == s:
            break
        s = tmps
    return s
def parse_dsl():
    worddef = WordDef()
    wasworddef = False
    for lineno, line in enumerate(read_all_lines(DICTIONARY_DSL, IN_ENCODING)):
        if len(line) == 0 or line.startswith('#'):
            continue
        if line[0] != '\t':
            # Headword line: start a new card once the previous card has seen a
            # definition line; otherwise treat it as an extra headword of the
            # card being collected.
            if wasworddef:
                if worddef.words:
                    yield worddef
                worddef = WordDef()
            worddef.words.append(line)
            wasworddef = False
        else:
            # Definition (body) line of the current card.
            s_line = line.strip('\t')
            if s_line.startswith("[m0]") or s_line.startswith("[m1][c red]"):
                wasworddef = True
                pspeech = RE_PSPEECH.findall(s_line)
                trs_all = RE_TRAN.findall(s_line)
                if IGNORE_TEXT_IN_PARENTHESIS:
                    s_line = remove_text_parenths(s_line)
                if RE_P_BRE.search(s_line) or RE_P_AME.search(s_line):
                    trs_bre, trs_ame = extract_trs(s_line)
                    if len(trs_all) != len(trs_bre) + len(trs_ame):
                        print("WARNING: (%s) extracted too few transcriptions, line %d"
                              % (",".join(worddef.words), lineno))
                        print(s_line)
                    if trs_bre or trs_ame:
                        worddef.entries.append(DefEntry(pspeech, trs_bre, trs_ame))
                elif len(trs_all) > 0:
                    print("WARNING: (%s) all transcriptions were left out, line %d"
                          % (",".join(worddef.words), lineno))
                    print(s_line)
    if worddef.words:
        yield worddef
def wrap_tr(l):
    return ["[%s]" % s for s in l]


def list2str(l):
    return ",".join(l)
def gen_csv(fileout):
    import csv
    with open(fileout, "w", encoding=OUT_ENCODING, newline='') as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(["word", "BrE", "NAmE", "both"])
        i = 0
        for worddef in parse_dsl():
            for word in worddef.words:
                i += 1
                if worddef.entries:
                    l_bre = []
                    l_ame = []
                    l_all = []
                    for entry in worddef.entries:
                        # Prefix transcriptions with the part of speech only when
                        # the card has several entries to tell apart.
                        if len(worddef.entries) == 1:
                            s_pspeech = ""
                        else:
                            s_pspeech = list2str(entry.pspeech)
                        if s_pspeech:
                            s_pspeech += ": "
                        if len(entry.trs_bre) != len(set(entry.trs_bre)) or len(entry.trs_ame) != len(set(entry.trs_ame)):
                            print("WARNING: (%s) duplicated transcription" % ",".join(worddef.words))
                        s_bre = list2str(wrap_tr(entry.trs_bre))
                        s_ame = list2str(wrap_tr(entry.trs_ame))
                        if s_bre:
                            l_bre.append(s_pspeech + s_bre)
                        if s_ame:
                            l_ame.append(s_pspeech + s_ame)
                        if s_bre or s_ame:
                            # The "both" column: one value when BrE and NAmE
                            # agree, otherwise both values with their labels.
                            s = s_pspeech
                            p = []
                            if s_bre == s_ame:
                                p.append(s_bre)
                            else:
                                if s_bre:
                                    p.append("BrE " + s_bre)
                                if s_ame:
                                    p.append("NAmE " + s_ame)
                            s += " ".join(p)
                            l_all.append(s)
                    row = (word, "; ".join(l_bre), "; ".join(l_ame), "; ".join(l_all))
                    writer.writerow(row)
    print("Processed %d word(s)" % i)


if __name__ == "__main__":
    gen_csv("en_oxford_adv_dict.csv")
"""
Adds word transcriptions from a dictionary generated by extracttrs.py into anki csv file.
"""
import csv
import re
RE_DELIM = re.compile(r"[,;]")
def normalize_word(word: str) -> str:
    # Reduce an Anki keyword to a bare dictionary headword: keep only the text
    # before the first comma or semicolon, strip whitespace and lower-case it.
    word = word.replace("\u00a0", " ")  # assumed: replaces a no-break space with a plain space
    word = RE_DELIM.split(word, 1)[0]
    return word.strip().lower()
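# For example (hypothetical input): normalize_word("Account, the") -> "account".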
"""
dictcsvfile:
word,BrE,NAmE,both
ankicsvfile (no header)
N,keyword,transcription
"""
def mergetrs(dictcsvfile: str, ankicsvfile: str, outcsvfile: str) -> None:
    # Build a lookup table: headword -> [BrE, NAmE, both] columns.
    words = {}
    with open(dictcsvfile, "r", encoding="utf-8") as fin:
        r = csv.reader(fin, dialect="excel")
        next(r)  # skip header
        for row in r:
            words[row[0].strip().lower()] = row[1:]

    # Append the transcription columns to every row of the Anki export.
    with open(ankicsvfile, "r", encoding="utf-8") as fin, \
            open(outcsvfile, "w", encoding="utf-8", newline='') as fout:
        r = csv.reader(fin, dialect="excel")
        w = csv.writer(fout, dialect="excel")
        for row in r:
            word = normalize_word(row[1])
            extra = words.get(word, [])
            if not extra:
                print("WARNING: %s not found" % word)
                extra = [""] * 3
            row += extra
            w.writerow(row)


if __name__ == "__main__":
    mergetrs("en_oxford_adv_dict.csv", "4000_eew_cloze.csv", "4000_eew_cloze_with_tr.csv")
    mergetrs("en_oxford_adv_dict.csv", "4000_eew_basic.csv", "4000_eew_basic_with_tr.csv")