Created
November 8, 2022 12:29
-
-
Save ShenZhouHong/0862c64c427a7b668f01b26b43c7ce79 to your computer and use it in GitHub Desktop.
Python object representing a Perseids Treebank text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree as ET
class Text:
    """
    Object representation of an Ancient Greek text formatted in the Perseids XML schema.

    See GAGDT (Guidelines for the Ancient Greek Dependency Treebank) 2.0 for schema
    information and additional metadata.

    Attributes:
        path: Location of the XML file, as passed to the constructor.
        tree: Parsed ``ElementTree`` of the whole document.
        root: Root element of ``tree``.
        body: The ``body`` element found directly under the root, or ``None``
            when the document has no such element.
    """

    def __init__(self, path: str) -> None:
        """Parse the XML document at *path* and locate its ``body`` element.

        Raises:
            OSError: If *path* cannot be opened.
            xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        """
        self.path: str = path
        self.tree: ET.ElementTree = ET.parse(self.path)
        self.root: ET.Element = self.tree.getroot()
        # find() only looks at direct children of the root and yields None
        # when no <body> is present, so the attribute is optional.
        self.body: "ET.Element | None" = self.tree.find("body")

    def count_sentences(self) -> int:
        """Return the number of ``sentence`` elements directly inside the body.

        A document without a ``body`` element contains no such sentences, so
        0 is returned in that case.
        """
        # Compare against None explicitly: an Element with no children can be
        # falsy, so a plain truthiness test would misread an empty <body/>.
        if self.body is None:
            return 0
        return len(self.body.findall("sentence"))

    def count_words(self) -> int:
        """Return the total number of ``word`` elements across all sentences.

        Only ``word`` elements that are direct children of a ``sentence``
        directly inside the body are counted. A document without a ``body``
        element yields 0.
        """
        if self.body is None:
            return 0
        # Sum the per-sentence word counts over the whole document.
        return sum(
            len(sentence.findall("word"))
            for sentence in self.body.findall("sentence")
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment