Skip to content

Instantly share code, notes, and snippets.

@SigmaX
Created December 13, 2017 15:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SigmaX/750ad9f0d185ffdc36f1c9311d326409 to your computer and use it in GitHub Desktop.
Save SigmaX/750ad9f0d185ffdc36f1c9311d326409 to your computer and use it in GitHub Desktop.
Simple Perseus Parsing
from collections import Counter

import xmltodict
def perseus_xml_to_text(path):
    """Load one Perseus TEI.2 XML file and return its paragraphs as strings.

    The work is loaded manually because it is unclear how to do it with CLTK
    (see https://github.com/cltk/cltk/issues/615).

    Args:
        path: Path to the XML file to read.

    Returns:
        A list with one string per ``div2`` section, produced by passing the
        section's ``p`` element to ``perseus_get_paragraph``.
    """
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the Perseus files are UTF-8 -- confirm.
    with open(path, encoding="utf-8") as f:
        doc = xmltodict.parse(f.read())
    sections = doc['TEI.2']['text']['body']['div1']['div2']
    # xmltodict yields a bare dict (not a one-element list) when the parent
    # has a single child of that name; normalise so the loop sees sections.
    if isinstance(sections, dict):
        sections = [sections]
    return [perseus_get_paragraph(section['p']) for section in sections]
def perseus_get_paragraph(p_root):
    """Return the text of one parsed ``p`` element.

    Args:
        p_root: The value xmltodict produced for a ``p`` element. This is a
            dict when the element has children or attributes, a plain string
            when it holds only text, and None when it is empty.

    Returns:
        The element's own text (``'#text'``) when present; otherwise the
        concatenated text of its ``q`` (quote) children. An empty element
        yields ``""``.

    Raises:
        KeyError: If ``p_root`` is a dict with neither ``'#text'`` nor ``'q'``.
    """
    if p_root is None:
        return ""
    if isinstance(p_root, str):
        # Text-only paragraph: xmltodict hands back the string itself.
        return p_root
    if '#text' in p_root:
        # XXX If paragraphs can contain both text *and* quote blocks, this
        # might overlook some data.
        return p_root['#text']
    # If the paragraph has no content, assume it contains a quote block.
    quotes = p_root['q']
    if not isinstance(quotes, list):
        # A single <q> child arrives as one item rather than a list.
        quotes = [quotes]
    pieces = []
    for quote in quotes:
        if quote is None:
            continue  # empty <q/>
        pieces.append(quote if isinstance(quote, str) else quote['#text'])
    return "".join(pieces)
def remove_punc(s):
    """Drop ',', '.', ';', ':' and '1' from *s*, then strip outer whitespace."""
    unwanted = (',', '.', ';', ':', '1')
    cleaned = ''.join(filter(lambda ch: ch not in unwanted, s))
    return cleaned.strip()
# Parse the twelve books of the Meditations, in order, into one flat list
# of paragraph strings.
paragraphs = []
for book_number in range(1, 13):
    paragraphs += perseus_xml_to_text(
        "texts/Meditations_book{0}.xml".format(book_number))

# Lemmatize
# NOTE(review): `lemmatizer` is not defined anywhere in the visible file; it
# must be created before this point (presumably a CLTK lemmatizer -- confirm).
lemmas_by_paragraph = [lemmatizer.lemmatize(p) for p in paragraphs]
lemmas = [remove_punc(lemma)
          for paragraph_lemmas in lemmas_by_paragraph
          for lemma in paragraph_lemmas]
# Count distinct lemmas: `lemmas` holds one entry per token, so its raw
# length would be the total token count, not the number of unique lemmas.
print("{0} unique lemmas.".format(len(set(lemmas))))

# Frequency count: list of (lemma, count) pairs, most frequent first; ties
# keep first-occurrence order.
counts = Counter(lemmas).most_common()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment