Created
December 13, 2017 15:54
-
-
Save SigmaX/750ad9f0d185ffdc36f1c9311d326409 to your computer and use it in GitHub Desktop.
Simple Perseus Parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter

import xmltodict
def perseus_xml_to_text(path):
    """Load a Perseus TEI XML file and return its paragraphs as strings.

    The work is loaded manually because it is unclear how to do this through
    CLTK (see https://github.com/cltk/cltk/issues/615).

    Args:
        path: Path of the XML file to read.

    Returns:
        A list with one string per ``div2`` element found under
        ``TEI.2/text/body/div1``, each produced by ``perseus_get_paragraph``
        from that element's ``p`` child.

    Raises:
        KeyError: If the document lacks the expected element hierarchy.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default encoding.  NOTE(review): assumes the files are UTF-8 -- confirm.
    with open(path, encoding="utf-8") as f:
        doc = xmltodict.parse(f.read())
    sections = doc['TEI.2']['text']['body']['div1']['div2']
    # xmltodict yields a bare mapping instead of a list when an element
    # occurs only once; normalise so a single-section book still works.
    if not isinstance(sections, list):
        sections = [sections]
    return [perseus_get_paragraph(section['p']) for section in sections]
def perseus_get_paragraph(p_root):
    """Return the text of one parsed ``p`` element.

    Args:
        p_root: The value xmltodict produced for a ``p`` element: a mapping
            (element with attributes or children), a plain string (element
            holding only text), or ``None`` (empty element).

    Returns:
        The paragraph's own ``#text`` when it has one; otherwise the
        concatenated text of its ``q`` (quote) children.  An empty element
        yields ``""``.

    Raises:
        KeyError: If a mapping has neither a ``#text`` nor a ``q`` entry.
    """
    if p_root is None:
        return ""
    if isinstance(p_root, str):
        # A text-only element is parsed to a bare string, not a mapping.
        return p_root
    if '#text' in p_root:
        # XXX If paragraphs can contain both text *and* quote blocks, this
        # might overlook some data.
        return p_root['#text']
    # If the paragraph has no content, assume it contains a quote block.
    quotes = p_root['q']
    if not isinstance(quotes, list):
        # A lone <q> child is not wrapped in a list by xmltodict.
        quotes = [quotes]
    return "".join(q if isinstance(q, str) else q['#text'] for q in quotes)
def remove_punc(s):
    """Return *s* without ``, . ; :`` and ``1``, stripped of outer whitespace."""
    # NOTE(review): the digit '1' is dropped along with the punctuation --
    # presumably a suffix emitted by the lemmatizer; confirm.
    dropped = frozenset((',', '.', ';', ':', '1'))
    kept = []
    for ch in s:
        if ch in dropped:
            continue
        kept.append(ch)
    return ''.join(kept).strip()
# Parse various books
# Books 1 through 12 of the Meditations, concatenated in order.
paragraphs = []
for book in range(1, 13):
    paragraphs += perseus_xml_to_text("texts/Meditations_book{0}.xml".format(book))

# Lemmatize
# NOTE(review): `lemmatizer` is not defined in the visible part of this file;
# it must be created before this point.
lemmas_by_paragraph = [lemmatizer.lemmatize(p) for p in paragraphs]
lemmas = [remove_punc(lemma) for p in lemmas_by_paragraph for lemma in p]

# Frequency count
# List of (lemma, count) pairs, most frequent first; ties keep the order in
# which the lemmas were first seen.
counts = Counter(lemmas).most_common()

# Report the number of *distinct* lemmas (len(lemmas) would count every
# occurrence, duplicates included).
print("{0} unique lemmas.".format(len(counts)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment