Skip to content

Instantly share code, notes, and snippets.

@zelinskiy
Last active September 6, 2019 14:30
Show Gist options
  • Save zelinskiy/53cf16a04786fe133facbacb6f47cb56 to your computer and use it in GitHub Desktop.
from cltk.corpus.utils.importer import CorpusImporter
from cltk.stem.lemma import LemmaReplacer
from cltk.tokenize.latin.sentence import SentenceTokenizer
from nltk.tokenize.punkt import PunktLanguageVars
from pystardict import Dictionary
from open_words.parse import Parse
import cltk.prosody.latin.string_utils as string_utils
from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.tag.pos import POSTag
from bs4 import BeautifulSoup
import ebooklib
from ebooklib import epub
from ebooklib.utils import parse_html_string
import random as rand
from datetime import datetime
import shutil
import os
import sys
from glob import glob
import re
from time import time, strftime
# TODO:
# Split dictionaries in smaller files
# Abbreviations version of postag_to_str
# Web app
# CLI
# "See *ref*" in Lewis
# Get texts from latin library
# Browser extension

# Truncation limit (in characters) for a dictionary sense copied into the book.
MAX_SYMBOLS = 1000
# Toggle for the Whitaker's Words parser; not referenced in the visible code.
USE_WHITAKER = False
# Tokens passed through verbatim (note: includes '' for fully-stripped words).
STOPS = ['.', ',', '?', '', '[', ']', ';', ':', '!', '(', ')']
# Sample text (opening of De Imitatione Christi); not referenced below.
DEIMITATIONE="Qui sequitur me non ambulat in tenebris dicit Dominus. Hæc sunt verba Christi, quibus admonemur quatenus vitam eius et mores imitemur, si volumus veraciter illuminari, et ab omni cæcitate cordis liberari. Summum igitur studium nostrum, sit in vita Jesu meditari."
# Feature flags: link words to the generated dictionary / grammar chapters.
DICTIONARIZE = True
GRAMMARIZE = True
def postag(taggers, sentence):
    """POS-tag *sentence* with the first tagger, filling gaps from backups.

    taggers: list of callables, each mapping a sentence string to a list of
        (word, tag) pairs; the first is primary, the rest are fallbacks
        tried in order.
    Returns the primary tagger's (word, tag) list, with each 'Unk'/None tag
    replaced by the tag of the first backup tagger that produced a usable
    tag at the same position.
    """
    outs = [tagger(sentence) for tagger in taggers]
    res = outs[0]
    for i, (w, tag) in enumerate(res):
        if tag in ['Unk', None]:
            for out in outs[1:]:
                # BUGFIX: compare the backup *tag* (out[i][1]), not the whole
                # (word, tag) tuple -- a tuple is never 'Unk'/None, so the
                # original accepted 'Unk' tags from the first backup tagger.
                if i < len(out) and out[i][1] not in ['Unk', None]:
                    res[i] = w, out[i][1]
                    break
    return res
def postag_to_str(tag_):
    """Expand a 9-character CLTK Latin POS tag into a grammatical description.

    tag_: a positional tag such as 'N-S---MN-' (case-insensitive), or
        'Unk'/None for an untagged word.
    Returns None for 'Unk'/None; otherwise a space-joined Latin description,
    e.g. "Nomen Singularis Masculinum Nominativus".  Position i of the
    lower-cased tag is looked up in the i-th table below; unrecognized
    characters (such as '-') at a position contribute nothing.  Tags shorter
    than 9 characters raise IndexError, as before.
    """
    if tag_ in ['Unk', None]:
        return None
    tag = tag_.lower()
    # One table per tag position: pars, persona, numerus, tempus, modus,
    # genus verbi, genus, casus, comparatio.
    tables = (
        {'n': "Nomen", 'v': "Verbum", 't': "Participium", 'a': "Adiectivum",
         'd': "Adverbium", 'c': "Coniunctio", 'r': "Praepositio",
         'p': "Pronomen", 'm': "Numerale", 'i': "Interjectio",
         'e': "Exclamatio", 'u': "Interpunctum"},
        {'1': "1 persona", '2': "2 persona", '3': "3 persona"},
        {'s': "Singularis", 'p': "Pluralis"},
        {'p': "Praesens", 'i': "Imperfectum", 'r': "Perfectum",
         'l': "Plusquamperfectum", 't': "Futurum exactum", 'f': "Futurum"},
        # BUGFIX: "Gerunduim" was a misspelling of the Latin "Gerundium".
        {'i': "Indicativus", 's': "Coniunctivus", 'n': "Infinitivus",
         'm': "Imperativus", 'p': "Participium", 'd': "Gerundium",
         'g': "Gerundivum", 'u': "Supinum"},
        {'a': "Activum", 'p': "Passivum"},
        {'m': "Masculinum", 'f': "Femininum", 'n': "Neutrum"},
        {'n': "Nominativus", 'g': "Genetivus", 'd': "Dativus",
         'a': "Accusativus", 'v': "Vocativus", 'b': "Ablativus",
         'l': "Locativus"},
        {'c': "Comparativus", 's': "Superlativus"},
    )
    ret = []
    for position, table in enumerate(tables):
        label = table.get(tag[position])
        if label is not None:
            ret.append(label)
    return " ".join(ret)
# One-off corpus download, disabled by being wrapped in a bare string
# literal: uncomment the desired corpora and execute this block once to
# fetch them via CLTK's CorpusImporter.
'''
ci = CorpusImporter('latin')
corpora = [#'latin_text_perseus',
#'latin_treebank_perseus',
#'latin_text_latin_library',
#'phi5',
#'phi7',
#'latin_proper_names_cltk',
#'latin_models_cltk',
#'latin_pos_lemmata_cltk',
'latin_treebank_index_thomisticus',
#'latin_lexica_perseus',
#'latin_training_set_sentence_cltk',
#'latin_word2vec_cltk',
#'latin_text_antique_digiliblt',
#'latin_text_corpus_grammaticorum_latinorum',
#'latin_text_poeti_ditalia',
#'latin_text_tesserae']
for cn in corpora:
ci.import_corpus(cn)
print("{} done".format(cn))
'''
t0 = time()  # wall-clock start for the final timing report

# NLP components: dictionary-based lemmatizer plus a backoff lemmatizer
# (b_lemmatizer is constructed but not used in the visible code).
lemmatizer = LemmaReplacer('latin')
b_lemmatizer = BackoffLatinLemmatizer()
tagger = POSTag('latin')
# Tagger cascade consumed by postag(); alternates kept disabled.
taggers = [
    tagger.tag_ngram_123_backoff,
    # tagger.tag_tnt,
    # tagger.tag_crf,
    # tagger.tag_unigram
]
sentence_tokenizer = SentenceTokenizer()
word_tokenizer = PunktLanguageVars()  # NOTE(review): constructed but unused below
# StarDict dictionaries at hard-coded local paths: Latin-Russian first,
# then Lewis 1890 Latin-English; searched in this order.
rus_dictionary = Dictionary('/home/nic/Desktop/latrus/latrus')
eng_dictionary = Dictionary('/home/nic/Desktop/Lewis1890/Lewis1890')
dicts = [rus_dictionary, eng_dictionary]
whitaker = Parse()  # Whitaker's Words parser; unused in the visible code
# lemma -> truncated sense text, for the generated dictionary chapter
out_dict = {}
# surface form -> (lemma or None, grammatical description), for conj chapter
conj_dict = {}
misses = set()       # words whose lemma was found in no dictionary
conj_misses = set()  # words that yielded no usable POS description
regex = r'\W+|\d+'   # strips non-word characters and digits from a token
bookfile = sys.argv[1]  # input EPUB path from the command line
bookname = bookfile.replace('.epub', '')
book = epub.read_epub(bookfile)
# Annotate every XHTML document in the book: wrap each recognized word in a
# link to the generated dictionary (dict.xhtml) or grammar (conj.xhtml) page.
for text_item in list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT)):
    print("Processing {}".format(text_item.file_name))
    soup = BeautifulSoup(text_item.get_content(), "lxml")
    for element in soup.body.find_all(text=True):
        sentences = sentence_tokenizer.tokenize(element.string)
        res = []  # rebuilt HTML fragments replacing this text node
        for (i, s) in enumerate(sentences):
            #ws = s.split() #word_tokenizer.word_tokenize(s)
            tagged_ws = postag(taggers, s)
            for (j, (w, tag)) in enumerate(tagged_ws):
                # Normalize: lowercase, fold ligatures, j -> i, then strip
                # punctuation/digits to match dictionary headwords.
                w_ = w.lower().replace("æ", "ae").replace("œ", "oe").replace("j", "i")
                w_ = re.sub(regex, '', w_)
                if w_ in STOPS:
                    res.append(w)
                else:
                    if DICTIONARIZE:
                        lemma = lemmatizer.lemmatize([w_])[0]
                        lemma = re.sub(regex, '', lemma)
                        found = False
                        if lemma not in out_dict:
                            # First dictionary containing the lemma wins.
                            for d in dicts:
                                if not found and lemma in d:
                                    sense = BeautifulSoup(d[lemma], "html.parser").get_text()
                                    sense = sense.replace("\n", " ")
                                    # Truncate the sense and abbreviate the
                                    # headword inside it ("lemma" -> "l.").
                                    out_dict[lemma] = sense[:MAX_SYMBOLS].replace(lemma, lemma[0] + ".")
                                    found = True
                        else:
                            found = True
                        if not found:
                            misses.add(w)
                    else:
                        found = False
                        # NOTE(review): on this path `lemma` stays unbound, so
                        # the GRAMMARIZE branch below would raise NameError --
                        # only safe because DICTIONARIZE is True above.
                    if GRAMMARIZE:
                        if w_ not in conj_dict:
                            conj_data = postag_to_str(tag)
                            conjugated = conj_data is not None
                            if conjugated:
                                conj_dict[w_] = (lemma if found else None, conj_data)
                        else:
                            conjugated = True
                        if not conjugated:
                            conj_misses.add(w_)
                    else:
                        conjugated = False
                    # Prefer a grammar link; fall back to a dictionary link,
                    # else emit the word unchanged.
                    if conjugated:
                        res.append(' <a href="conj.xhtml#{1}">{0}</a> '.format(w, w_))
                    elif found:
                        res.append(' <a href="dict.xhtml#{1}">{0}</a> '.format(w, lemma))
                    else:
                        res.append(" " + w)
        element.replace_with(BeautifulSoup("".join(res), "html.parser"))
    chapter_str = soup.prettify(formatter='lxml')
    text_item.set_content(chapter_str.encode('utf-8'))
# Build the dictionary chapter: one <p id="lemma"> per entry, sorted, so the
# in-text links (dict.xhtml#lemma) resolve as anchors.
soup = BeautifulSoup('<html><head><title>Dictionary</title></head><body></body></html>', 'html.parser')
for k in sorted(out_dict):
    p = soup.new_tag('p', id=k)
    innerHTML = '<b>{0}</b> {1}'.format(k, out_dict[k])
    p.append(BeautifulSoup(innerHTML, 'html.parser'))
    soup.html.body.append(p)
dict_chap = epub.EpubHtml(title='Dictionary', file_name='dict.xhtml', lang='en')
dict_chap.content = soup.prettify(formatter='lxml')
dict_chap.id = 'dict'
book.add_item(dict_chap)
book.toc.append(epub.Link(href="dict.xhtml", title="Dictionary"))
book.spine.append((dict_chap.id, 'yes'))

# Build the conjugations chapter: each entry cross-links to its lemma's
# dictionary anchor when one was found, else shows "?".
soup = BeautifulSoup('<html><head><title>Conjugations</title></head><body></body></html>', 'html.parser')
for k in sorted(conj_dict):
    p = soup.new_tag('p', id=k)
    lemma, conj = conj_dict[k]
    if lemma is None:
        dict_link = "?"
    else:
        dict_link = '<a href="dict.xhtml#{0}">{0}</a>'.format(lemma)
    innerHTML = '<b>{0}</b> {1} {2}'.format(k, dict_link, conj)
    p.append(BeautifulSoup(innerHTML, 'html.parser'))
    soup.html.body.append(p)
conj_chap = epub.EpubHtml(title='Conjugations', file_name='conj.xhtml', lang='en')
conj_chap.content = soup.prettify(formatter='lxml')
conj_chap.id = 'conj'
book.add_item(conj_chap)
book.toc.append(epub.Link(href="conj.xhtml", title="Conjugations"))
book.spine.append((conj_chap.id, 'yes'))

# Write the annotated book next to the input as <name>.dict.epub, then report.
epub.write_epub('{}.dict.epub'.format(bookname), book, {"epub3_pages": False})
#os.system("ebook-convert {0}.dict.epub {0}.dict.mobi".format(bookname))
print("{0} lemmas found, {1} missed".format(len(out_dict), len(misses)))
print("{0} conj found, {1} missed".format(len(conj_dict),len(conj_misses)))
print("Processed in {}s".format(int(time() - t0)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment