Last active
September 6, 2019 14:30
-
-
Save zelinskiy/53cf16a04786fe133facbacb6f47cb56 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from cltk.corpus.utils.importer import CorpusImporter | |
from cltk.stem.lemma import LemmaReplacer | |
from cltk.tokenize.latin.sentence import SentenceTokenizer | |
from nltk.tokenize.punkt import PunktLanguageVars | |
from pystardict import Dictionary | |
from open_words.parse import Parse | |
import cltk.prosody.latin.string_utils as string_utils | |
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer | |
from cltk.tag.pos import POSTag | |
from bs4 import BeautifulSoup | |
import ebooklib | |
from ebooklib import epub | |
from ebooklib.utils import parse_html_string | |
import random as rand | |
from datetime import datetime | |
import shutil | |
import os | |
import sys | |
from glob import glob | |
import re | |
from time import time, strftime | |
# TODO:
# Split dictionaries in smaller files
# Abbreviations version of postag_to_str
# Web app
# CLI
# "See *ref*" in Lewis
# Get texts from latin library
# Browser extension
# Maximum number of characters of a dictionary sense kept per lemma.
MAX_SYMBOLS = 1000
# Toggle for the Whitaker's Words parser (not referenced in the visible code).
USE_WHITAKER = False
# Tokens treated as punctuation: emitted verbatim, never looked up.
# NOTE: includes '' because stripping punctuation/digits can empty a token.
STOPS = ['.', ',', '?', '', '[', ']', ';', ':', '!', '(', ')']
# Sample text (opening of Thomas a Kempis, "De Imitatione Christi");
# not referenced in the visible code.
DEIMITATIONE="Qui sequitur me non ambulat in tenebris dicit Dominus. Hæc sunt verba Christi, quibus admonemur quatenus vitam eius et mores imitemur, si volumus veraciter illuminari, et ab omni cæcitate cordis liberari. Summum igitur studium nostrum, sit in vita Jesu meditari."
# Build the dictionary chapter and link words to their entries.
DICTIONARIZE = True
# Build the conjugations chapter and link words to grammatical analyses.
GRAMMARIZE = True
def postag(taggers, sentence):
    """POS-tag *sentence*, backing off across several taggers.

    taggers: list of callables, each mapping a sentence string to a list
        of (word, tag) pairs (e.g. CLTK POSTag methods).
    Returns the first tagger's (word, tag) list, with any 'Unk'/None tag
    replaced by the tag from the first later tagger that produced a real
    tag for the same position.
    """
    outs = [tagger(sentence) for tagger in taggers]
    res = outs[0]
    for i, (w, tag) in enumerate(res):
        if tag in ('Unk', None):
            for out in outs[1:]:
                # Bug fix: compare the backup *tag* (out[i][1]), not the
                # whole (word, tag) tuple — a tuple is never 'Unk'/None,
                # so the original accepted unknown fallback tags and
                # stopped before consulting later taggers.
                if i < len(out) and out[i][1] not in ('Unk', None):
                    res[i] = w, out[i][1]
                    break
    return res
def postag_to_str(tag_):
    """Render a 9-character CLTK Latin POS tag as Latin grammatical terms.

    Example: 'v1spia---' -> 'Verbum 1 persona Singularis Praesens
    Indicativus Activum'.

    Returns None for unresolved tags ('Unk' or None).  Unrecognised
    characters (e.g. '-') at any position are skipped; tags shorter than
    9 characters are tolerated (the original raised IndexError).
    """
    if tag_ in ('Unk', None):
        return None
    tag = tag_.lower()
    # One lookup table per tag position: pars orationis, persona,
    # numerus, tempus, modus, genus verbi, genus, casus, comparatio.
    tables = (
        {'n': "Nomen", 'v': "Verbum", 't': "Participium",
         'a': "Adiectivum", 'd': "Adverbium", 'c': "Coniunctio",
         'r': "Praepositio", 'p': "Pronomen", 'm': "Numerale",
         'i': "Interjectio", 'e': "Exclamatio", 'u': "Interpunctum"},
        {'1': "1 persona", '2': "2 persona", '3': "3 persona"},
        {'s': "Singularis", 'p': "Pluralis"},
        {'p': "Praesens", 'i': "Imperfectum", 'r': "Perfectum",
         'l': "Plusquamperfectum", 't': "Futurum exactum", 'f': "Futurum"},
        # Bug fix: "Gerundium" was misspelled "Gerunduim" in the original.
        {'i': "Indicativus", 's': "Coniunctivus", 'n': "Infinitivus",
         'm': "Imperativus", 'p': "Participium", 'd': "Gerundium",
         'g': "Gerundivum", 'u': "Supinum"},
        {'a': "Activum", 'p': "Passivum"},
        {'m': "Masculinum", 'f': "Femininum", 'n': "Neutrum"},
        {'n': "Nominativus", 'g': "Genetivus", 'd': "Dativus",
         'a': "Accusativus", 'v': "Vocativus", 'b': "Ablativus",
         'l': "Locativus"},
        {'c': "Comparativus", 's': "Superlativus"},
    )
    ret = [table[c] for table, c in zip(tables, tag) if c in table]
    return " ".join(ret)
# One-off CLTK corpus download, disabled by wrapping it in a module-level
# string literal.  Remove the quotes to (re)import the Latin corpora.
'''
ci = CorpusImporter('latin')
corpora = [#'latin_text_perseus',
    #'latin_treebank_perseus',
    #'latin_text_latin_library',
    #'phi5',
    #'phi7',
    #'latin_proper_names_cltk',
    #'latin_models_cltk',
    #'latin_pos_lemmata_cltk',
    'latin_treebank_index_thomisticus',
    #'latin_lexica_perseus',
    #'latin_training_set_sentence_cltk',
    #'latin_word2vec_cltk',
    #'latin_text_antique_digiliblt',
    #'latin_text_corpus_grammaticorum_latinorum',
    #'latin_text_poeti_ditalia',
    #'latin_text_tesserae']
for cn in corpora:
    ci.import_corpus(cn)
    print("{} done".format(cn))
'''
t0 = time()  # wall-clock start, reported at the end of the run
# CLTK NLP components for Latin.
lemmatizer = LemmaReplacer('latin')
b_lemmatizer = BackoffLatinLemmatizer()  # NOTE(review): not used below — confirm
tagger = POSTag('latin')
# Taggers consulted in order by postag(); later entries back up the first.
taggers = [
    tagger.tag_ngram_123_backoff,
#    tagger.tag_tnt,
#    tagger.tag_crf,
#    tagger.tag_unigram
]
sentence_tokenizer = SentenceTokenizer()
word_tokenizer = PunktLanguageVars()  # NOTE(review): not used below — confirm
# StarDict dictionaries, tried in list order (Russian first, then Lewis 1890
# Latin-English).  NOTE(review): hard-coded absolute paths — parameterize.
rus_dictionary = Dictionary('/home/nic/Desktop/latrus/latrus')
eng_dictionary = Dictionary('/home/nic/Desktop/Lewis1890/Lewis1890')
dicts = [rus_dictionary, eng_dictionary]
whitaker = Parse()  # Whitaker's Words parser; not consulted below
out_dict = {}  # lemma -> truncated dictionary sense (dict.xhtml entries)
conj_dict = {}  # surface form -> (lemma or None, grammatical description)
misses = set()  # words whose lemma was found in no dictionary
conj_misses = set()  # normalised words with no usable POS tag
regex = r'\W+|\d+'  # strips non-word characters and digits from tokens
bookfile = sys.argv[1]  # input .epub path from the command line
bookname = bookfile.replace('.epub', '')  # basename for the output file
book = epub.read_epub(bookfile)
# Main pass: rewrite every text node of every XHTML document in the epub,
# wrapping each recognised word in a link to its dictionary and/or
# conjugation entry.
for text_item in list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT)):
    print("Processing {}".format(text_item.file_name))
    soup = BeautifulSoup(text_item.get_content(), "lxml")
    for element in soup.body.find_all(text=True):
        sentences = sentence_tokenizer.tokenize(element.string)
        res = []  # HTML fragments that will replace this text node
        for (i, s) in enumerate(sentences):
            #ws = s.split() #word_tokenizer.word_tokenize(s)
            tagged_ws = postag(taggers, s)
            for (j, (w, tag)) in enumerate(tagged_ws):
                # Normalise: lowercase, unfold æ/œ ligatures, classical
                # j -> i, then strip punctuation and digits.
                w_ = w.lower().replace("æ", "ae").replace("œ", "oe").replace("j", "i")
                w_ = re.sub(regex, '', w_)
                if w_ in STOPS:
                    # Punctuation (or an emptied token): pass through as-is.
                    res.append(w)
                else:
                    if DICTIONARIZE:
                        lemma = lemmatizer.lemmatize([w_])[0]
                        lemma = re.sub(regex, '', lemma)
                        found = False
                        if lemma not in out_dict:
                            # First encounter: query the dictionaries in order,
                            # keeping only the first matching sense.
                            for d in dicts:
                                if not found and lemma in d:
                                    sense = BeautifulSoup(d[lemma], "html.parser").get_text()
                                    sense = sense.replace("\n", " ")
                                    # Truncate the sense and abbreviate the
                                    # headword inside it ("amo" -> "a.").
                                    out_dict[lemma] = sense[:MAX_SYMBOLS].replace(lemma, lemma[0] + ".")
                                    found = True
                        else:
                            found = True
                        if not found:
                            misses.add(w)
                    else:
                        found = False
                    if GRAMMARIZE:
                        if w_ not in conj_dict:
                            conj_data = postag_to_str(tag)
                            conjugated = conj_data is not None
                            if conjugated:
                                # `lemma` is only defined when DICTIONARIZE ran;
                                # `found` is False otherwise, so the conditional
                                # never evaluates it.
                                conj_dict[w_] = (lemma if found else None, conj_data)
                        else:
                            conjugated = True
                        if not conjugated:
                            conj_misses.add(w_)
                    else:
                        conjugated = False
                    # Prefer the conjugation link; fall back to the dictionary
                    # link, then to the bare word.
                    if conjugated:
                        res.append(' <a href="conj.xhtml#{1}">{0}</a> '.format(w, w_))
                    elif found:
                        res.append(' <a href="dict.xhtml#{1}">{0}</a> '.format(w, lemma))
                    else:
                        res.append(" " + w)
        element.replace_with(BeautifulSoup("".join(res), "html.parser"))
    chapter_str = soup.prettify(formatter='lxml')
    text_item.set_content(chapter_str.encode('utf-8'))
# Assemble the dictionary chapter: one <p> per lemma, anchored by id so
# that word links ("dict.xhtml#<lemma>") jump straight to the entry.
soup = BeautifulSoup('<html><head><title>Dictionary</title></head><body></body></html>', 'html.parser')
for headword in sorted(out_dict):
    entry = soup.new_tag('p', id=headword)
    markup = '<b>{0}</b> {1}'.format(headword, out_dict[headword])
    entry.append(BeautifulSoup(markup, 'html.parser'))
    soup.html.body.append(entry)
# Register the chapter in the book, its table of contents and its spine.
dict_chap = epub.EpubHtml(title='Dictionary', file_name='dict.xhtml', lang='en')
dict_chap.id = 'dict'
dict_chap.content = soup.prettify(formatter='lxml')
book.add_item(dict_chap)
book.toc.append(epub.Link(href="dict.xhtml", title="Dictionary"))
book.spine.append((dict_chap.id, 'yes'))
# Assemble the conjugations chapter: one <p> per inflected form, linking
# back to the form's dictionary entry when a lemma was found.
soup = BeautifulSoup('<html><head><title>Conjugations</title></head><body></body></html>', 'html.parser')
for form in sorted(conj_dict):
    lemma, analysis = conj_dict[form]
    # "?" marks forms whose lemma was not found in any dictionary.
    link = "?" if lemma is None else '<a href="dict.xhtml#{0}">{0}</a>'.format(lemma)
    entry = soup.new_tag('p', id=form)
    entry.append(BeautifulSoup('<b>{0}</b> {1} {2}'.format(form, link, analysis), 'html.parser'))
    soup.html.body.append(entry)
# Register the chapter in the book, its table of contents and its spine.
conj_chap = epub.EpubHtml(title='Conjugations', file_name='conj.xhtml', lang='en')
conj_chap.id = 'conj'
conj_chap.content = soup.prettify(formatter='lxml')
book.add_item(conj_chap)
book.toc.append(epub.Link(href="conj.xhtml", title="Conjugations"))
book.spine.append((conj_chap.id, 'yes'))
# Write the annotated epub next to the input and report run statistics.
epub.write_epub('{}.dict.epub'.format(bookname), book, {"epub3_pages": False})
# Optional Kindle conversion (requires Calibre's ebook-convert):
#os.system("ebook-convert {0}.dict.epub {0}.dict.mobi".format(bookname))
elapsed = int(time() - t0)
print("{0} lemmas found, {1} missed".format(len(out_dict), len(misses)))
print("{0} conj found, {1} missed".format(len(conj_dict), len(conj_misses)))
print("Processed in {}s".format(elapsed))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment