"""
Extracts word transcriptions from En-En_Oxford Advanced Learners Dictionary.dsl
These words have invalid definitions:
shit, to
"""
import re
IGNORE_TEXT_IN_PARENTHESIS = True
DICTIONARY_DSL = "En-En_Oxford Advanced Learners Dictionary.dsl"
IN_ENCODING = "utf-16"
OUT_ENCODING = "utf-8"
RE_PSPEECH = re.compile(r"\[c orange\](?:[ ]*)(.*?)(?:[, ]*)\[/c\]")
RE_TRAN = re.compile(r"\[c darkcyan\](?:\\\[)(.*?)(?:\\\])\[/c\]")
RE_P_BRE = re.compile(r"\[p\]BrE\[/p\]")
RE_P_AME = re.compile(r"\[p\]N?AmE\[/p\]")
RE_PARENTHESIS = re.compile(r"\([^()]*\)")
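# Illustrative (not verbatim) body line that the patterns above are aimed at;
# the real markup in the dictionary may differ in detail:
#   [m1][c red]example[/c] [c orange]noun[/c] [p]BrE[/p] [c darkcyan]\[ɪɡˈzɑːmpl\][/c] [p]NAmE[/p] [c darkcyan]\[ɪɡˈzæmpl\][/c]
# RE_PSPEECH captures the part of speech, RE_TRAN the escaped-bracket
# transcription, and RE_P_BRE / RE_P_AME mark which variant a transcription
# belongs to.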
def read_all_lines(filename, encoding):
    with open(filename, "r", encoding=encoding) as f:
        for line in f:
            yield line.rstrip()
def extract_trs(s):
    """Collects BrE and NAmE transcriptions from a body line, using the
    [p]BrE[/p] / [p]NAmE[/p] markers to decide which list each one goes to."""
    trs_bre = []
    trs_ame = []
    for part in RE_P_BRE.split(s):
        subpart = RE_P_AME.split(part, 1)
        trs_bre.extend(RE_TRAN.findall(subpart[0]))
        if len(subpart) > 1:
            trs_ame.extend(RE_TRAN.findall(subpart[1]))
    return trs_bre, trs_ame
class DefEntry:
    def __init__(self, pspeech, trs_bre, trs_ame):
        self.pspeech = pspeech
        self.trs_bre = trs_bre
        self.trs_ame = trs_ame


class WordDef:
    def __init__(self):
        self.words = []
        self.entries = []
def remove_text_parenths(s):
    # Substitute repeatedly so that nested parentheses are removed as well.
    while True:
        tmps = RE_PARENTHESIS.sub("", s)
        if tmps == s:
            break
        s = tmps
    return s
def parse_dsl():
    worddef = WordDef()
    wasworddef = False
    for lineno, line in enumerate(read_all_lines(DICTIONARY_DSL, IN_ENCODING)):
        if len(line) == 0 or line.startswith('#'):
            continue
        if line[0] != '\t':
            # Headword line: start a new card once the previous card has seen a
            # definition line; otherwise treat it as an extra headword of the
            # card being collected.
            if wasworddef:
                if worddef.words:
                    yield worddef
                worddef = WordDef()
            worddef.words.append(line)
            wasworddef = False
        else:
            # Definition (body) line of the current card.
            s_line = line.strip('\t')
            if s_line.startswith("[m0]") or s_line.startswith("[m1][c red]"):
                wasworddef = True
                pspeech = RE_PSPEECH.findall(s_line)
                trs_all = RE_TRAN.findall(s_line)
                if IGNORE_TEXT_IN_PARENTHESIS:
                    s_line = remove_text_parenths(s_line)
                if RE_P_BRE.search(s_line) or RE_P_AME.search(s_line):
                    trs_bre, trs_ame = extract_trs(s_line)
                    if len(trs_all) != len(trs_bre) + len(trs_ame):
                        print("WARNING: (%s) extracted too few transcriptions, line %d"
                              % (",".join(worddef.words), lineno))
                        print(s_line)
                    if trs_bre or trs_ame:
                        worddef.entries.append(DefEntry(pspeech, trs_bre, trs_ame))
                elif len(trs_all) > 0:
                    print("WARNING: (%s) all transcriptions were left out, line %d"
                          % (",".join(worddef.words), lineno))
                    print(s_line)
    if worddef.words:
        yield worddef
def wrap_tr(l):
    return ["[%s]" % s for s in l]


def list2str(l):
    return ",".join(l)
def gen_csv(fileout):
    import csv
    with open(fileout, "w", encoding=OUT_ENCODING, newline='') as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(["word", "BrE", "NAmE", "both"])
        i = 0
        for worddef in parse_dsl():
            for word in worddef.words:
                i += 1
                if worddef.entries:
                    l_bre = []
                    l_ame = []
                    l_all = []
                    for entry in worddef.entries:
                        # Prefix transcriptions with the part of speech only when
                        # the card has several entries to tell apart.
                        if len(worddef.entries) == 1:
                            s_pspeech = ""
                        else:
                            s_pspeech = list2str(entry.pspeech)
                        if s_pspeech:
                            s_pspeech += ": "
                        if len(entry.trs_bre) != len(set(entry.trs_bre)) or len(entry.trs_ame) != len(set(entry.trs_ame)):
                            print("WARNING: (%s) duplicated transcription" % ",".join(worddef.words))
                        s_bre = list2str(wrap_tr(entry.trs_bre))
                        s_ame = list2str(wrap_tr(entry.trs_ame))
                        if s_bre:
                            l_bre.append(s_pspeech + s_bre)
                        if s_ame:
                            l_ame.append(s_pspeech + s_ame)
                        if s_bre or s_ame:
                            # The "both" column: one value when BrE and NAmE
                            # agree, otherwise both values with their labels.
                            s = s_pspeech
                            p = []
                            if s_bre == s_ame:
                                p.append(s_bre)
                            else:
                                if s_bre:
                                    p.append("BrE " + s_bre)
                                if s_ame:
                                    p.append("NAmE " + s_ame)
                            s += " ".join(p)
                            l_all.append(s)
                    row = (word, "; ".join(l_bre), "; ".join(l_ame), "; ".join(l_all))
                    writer.writerow(row)
    print("Processed %d word(s)" % i)


if __name__ == "__main__":
    gen_csv("en_oxford_adv_dict.csv")
"""
Adds word transcriptions from a dictionary generated by extracttrs.py into anki csv file.
"""
import csv
import re
RE_DELIM = re.compile(r"[,;]")
def normalize_word(word: str) -> str:
    # Reduce an Anki keyword to a bare dictionary headword: keep only the text
    # before the first comma or semicolon, strip whitespace and lower-case it.
    word = word.replace("\u00a0", " ")  # assumed: replaces a no-break space with a plain space
    word = RE_DELIM.split(word, 1)[0]
    return word.strip().lower()
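# For example (hypothetical input): normalize_word("Account, the") -> "account".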
"""
dictcsvfile:
word,BrE,NAmE,both
ankicsvfile (no header)
N,keyword,transcription
"""
def mergetrs(dictcsvfile: str, ankicsvfile: str, outcsvfile: str) -> None:
    # Build a lookup table: headword -> [BrE, NAmE, both] columns.
    words = {}
    with open(dictcsvfile, "r", encoding="utf-8") as fin:
        r = csv.reader(fin, dialect="excel")
        next(r)  # skip header
        for row in r:
            words[row[0].strip().lower()] = row[1:]

    # Append the transcription columns to every row of the Anki export.
    with open(ankicsvfile, "r", encoding="utf-8") as fin, \
            open(outcsvfile, "w", encoding="utf-8", newline='') as fout:
        r = csv.reader(fin, dialect="excel")
        w = csv.writer(fout, dialect="excel")
        for row in r:
            word = normalize_word(row[1])
            extra = words.get(word, [])
            if not extra:
                print("WARNING: %s not found" % word)
                extra = [""] * 3
            row += extra
            w.writerow(row)


if __name__ == "__main__":
    mergetrs("en_oxford_adv_dict.csv", "4000_eew_cloze.csv", "4000_eew_cloze_with_tr.csv")
    mergetrs("en_oxford_adv_dict.csv", "4000_eew_basic.csv", "4000_eew_basic_with_tr.csv")