Created
December 13, 2017 15:52
-
-
Save SigmaX/2360a78e8c6a6bbc51711c2bd8e351d4 to your computer and use it in GitHub Desktop.
Modified XMLCorpusReader whose words() also tokenizes each element's tail text, not just its text nodes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function, unicode_literals | |
import codecs | |
# Use the c version of ElementTree, which is faster, if possible: | |
try: from xml.etree import cElementTree as ElementTree | |
except ImportError: from xml.etree import ElementTree | |
from six import string_types | |
from nltk.data import SeekableUnicodeStreamReader | |
from nltk.tokenize import WordPunctTokenizer | |
from nltk.internals import ElementWrapper | |
from nltk.corpus.reader.api import CorpusReader | |
from nltk.corpus.reader.util import * | |
class MyXMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Unlike the stock ``XMLCorpusReader``, ``words()`` also tokenizes each
    element's ``tail`` text, so text appearing after a closing child tag is
    not lost.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """

    def __init__(self, root, fileids, wrap_etree=False):
        # When True, parsed elements are wrapped in nltk's ElementWrapper.
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """
        Parse the given XML file and return its root element.

        :param fileid: a single file identifier; may be omitted only when
            the corpus contains exactly one file.
        :raises TypeError: if no single file can be determined.
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, string_types):
            raise TypeError('Expected a single file identifier string')
        # Read the XML in using ElementTree.  Close the stream when done --
        # the original left the file handle returned by open() leaking.
        stream = self.abspath(fileid).open()
        try:
            elt = ElementTree.parse(stream).getroot()
        finally:
            stream.close()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        # Resolve fileid the same way xml() does, so encoding() receives a
        # concrete identifier even when the caller passed None.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        out = []
        # Element.iter() replaces getiterator(), which was deprecated and
        # removed in Python 3.9; the traversal order is identical.
        for node in elt.iter():
            # Tokenize both the element's text and its tail (text that
            # follows the element's closing tag) -- the tail handling is
            # this class's extension over the stock reader.
            for text in (node.text, node.tail):
                if text is None:
                    continue
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                out.extend(word_tokenizer.tokenize(text))
        return out

    def raw(self, fileids=None):
        """
        Return the concatenated raw contents of the given files.

        :param fileids: a file identifier, a list of identifiers, or None
            for every file in the corpus.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])
# Lemmatizer for Ancient Greek (cltk ships per-language lemma tables).
from cltk.stem.lemma import LemmaReplacer
lemmatizer = LemmaReplacer('greek')
# Read a single XML document (Marcus Aurelius, Meditations, book 1)
# from the local "texts/" directory.
cr = MyXMLCorpusReader("texts/", "Meditations_book1.xml")
# Retrieve all words, removing punctuation
# NOTE(review): '1' is treated as punctuation here -- presumably to drop
# section numbers embedded in the text; confirm against the source XML.
punc = [',', '.', ';', ':', '1']
def remove_punc(s):
    """Return *s* with every character listed in ``punc`` removed, stripped of
    surrounding whitespace."""
    kept = (ch for ch in s if ch not in punc)
    return ''.join(kept).strip()
from collections import Counter

# Tokens that are pure punctuation are dropped; embedded punctuation
# characters are stripped from the remaining tokens.
words = [remove_punc(w) for w in cr.words() if w not in punc]
# Lemmatize.  BUG FIX: the original comprehension collected the surface
# word ``w`` instead of the lemma ``wl``, so the "lemmas" list actually
# contained the unlemmatized words.
lemmas = [wl for w in words for wl in lemmatizer.lemmatize(w)]
# BUG FIX: the original printed len(lemmas) (total tokens) while claiming
# a count of *unique* lemmas.
print("{0} unique lemmas.".format(len(set(lemmas))))
# Frequency count, most frequent first.  Counter.most_common() yields the
# same (lemma, count) pairs, sorted by descending count, as the original
# hand-rolled dict + sorted().
counts = Counter(lemmas).most_common()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment