Skip to content

Instantly share code, notes, and snippets.

@SigmaX
Created December 13, 2017 15:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SigmaX/2360a78e8c6a6bbc51711c2bd8e351d4 to your computer and use it in GitHub Desktop.
Modified XMLCorpusReader
from __future__ import print_function, unicode_literals
import codecs
# Use the c version of ElementTree, which is faster, if possible:
try: from xml.etree import cElementTree as ElementTree
except ImportError: from xml.etree import ElementTree
from six import string_types
from nltk.data import SeekableUnicodeStreamReader
from nltk.tokenize import WordPunctTokenizer
from nltk.internals import ElementWrapper
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import *
class MyXMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Unlike NLTK's stock ``XMLCorpusReader``, ``words()`` here also collects
    the text found in each node's ``tail`` (text that follows a child
    element), so mixed-content XML is tokenized completely.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves. See the XML specs for more info.
    """
    def __init__(self, root, fileids, wrap_etree=False):
        # When True, xml() wraps the parsed root in nltk's ElementWrapper
        # for nicer interactive display.
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """
        Parse the given XML file and return its root element.

        :param fileid: a single file identifier; may be omitted only when
            the reader was constructed with exactly one file.
        :raises TypeError: if no single file can be determined.
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, string_types):
            raise TypeError('Expected a single file identifier string')
        # Read the XML in using ElementTree.  Close the stream explicitly:
        # the original version leaked the open file handle.
        stream = self.abspath(fileid).open()
        try:
            elt = ElementTree.parse(stream).getroot()
        finally:
            stream.close()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        out = []
        # Element.iter() replaces getiterator(), which was deprecated and
        # removed entirely in Python 3.9.
        for node in elt.iter():
            # Tokenize both the node's text and its tail; the tail handling
            # is the behavioral difference from nltk's XMLCorpusReader.
            for text in (node.text, node.tail):
                if text is None:
                    continue
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                out.extend(word_tokenizer.tokenize(text))
        return out

    def raw(self, fileids=None):
        """Return the raw (undecoded-by-us) concatenated text of *fileids*."""
        if fileids is None: fileids = self._fileids
        elif isinstance(fileids, string_types): fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])
# CLTK lemmatizer for Ancient Greek (dictionary-based lookup).
from cltk.stem.lemma import LemmaReplacer
lemmatizer = LemmaReplacer('greek')
# Corpus reader over a single XML file -- presumably Marcus Aurelius'
# Meditations, Book 1; verify the path against the local corpus layout.
cr = MyXMLCorpusReader("texts/", "Meditations_book1.xml")
# Retrieve all words, removing punctuation
# NOTE(review): '1' is treated as punctuation -- looks like bare digits
# appear as section markers in this text; confirm against the source XML.
punc = [',', '.', ';', ':', '1']
def remove_punc(s, chars=(',', '.', ';', ':', '1')):
    """Return *s* with every character in *chars* deleted, then
    whitespace-stripped.

    Generalized from the original, which read the module-level ``punc``
    list; the default matches that list, so existing callers are
    unaffected, and the function no longer depends on a global.

    :param s: the token to clean
    :param chars: characters to remove (default: the script's punctuation set)
    :return: the cleaned string
    """
    return ''.join(c for c in s if c not in chars).strip()
from collections import Counter

# Drop standalone punctuation tokens, and strip punctuation characters
# embedded in the tokens we keep.
words = [remove_punc(w) for w in cr.words() if w not in punc]
# Lemmatize.  Bug fix: the original comprehension yielded the raw word `w`
# instead of the lemma `wl`, discarding the lemmatizer's output entirely.
lemmas = [wl for w in words for wl in lemmatizer.lemmatize(w)]
# Bug fix: report the number of *unique* lemmas, as the message claims,
# instead of the total lemma-token count.
print("{0} unique lemmas.".format(len(set(lemmas))))
# Frequency count, sorted most-frequent first (Counter replaces the
# hand-rolled zero-init dict + increment loop).
counts = sorted(Counter(lemmas).items(), key=lambda x: x[1], reverse=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment