Created
December 13, 2017 15:52
-
-
Save SigmaX/2360a78e8c6a6bbc51711c2bd8e351d4 to your computer and use it in GitHub Desktop.
Modified XMLCorpusReader whose words() also tokenizes each element's tail text, not just its text nodes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function, unicode_literals | |
import codecs | |
# Use the c version of ElementTree, which is faster, if possible: | |
try: from xml.etree import cElementTree as ElementTree | |
except ImportError: from xml.etree import ElementTree | |
from six import string_types | |
from nltk.data import SeekableUnicodeStreamReader | |
from nltk.tokenize import WordPunctTokenizer | |
from nltk.internals import ElementWrapper | |
from nltk.corpus.reader.api import CorpusReader | |
from nltk.corpus.reader.util import * | |
class MyXMLCorpusReader(CorpusReader):
    """
    Corpus reader for corpora whose documents are xml files.

    Unlike the stock ``XMLCorpusReader``, ``words()`` also tokenizes each
    element's ``tail`` text, so text appearing after a closing child tag is
    not lost.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    """

    def __init__(self, root, fileids, wrap_etree=False):
        # When True, parsed elements are wrapped in nltk's ElementWrapper.
        self._wrap_etree = wrap_etree
        CorpusReader.__init__(self, root, fileids)

    def xml(self, fileid=None):
        """
        Parse the given XML file and return its root element.

        :param fileid: a single file identifier; may be omitted only when
            the corpus contains exactly one file.
        :raises TypeError: if no single file can be determined.
        """
        # Make sure we have exactly one file -- no concatenating XML.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        if not isinstance(fileid, string_types):
            raise TypeError('Expected a single file identifier string')
        # Read the XML in using ElementTree.  Close the stream when done --
        # the original left the file handle returned by open() leaking.
        stream = self.abspath(fileid).open()
        try:
            elt = ElementTree.parse(stream).getroot()
        finally:
            stream.close()
        # If requested, wrap it.
        if self._wrap_etree:
            elt = ElementWrapper(elt)
        # Return the ElementTree element.
        return elt

    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """
        elt = self.xml(fileid)
        # Resolve fileid the same way xml() does, so encoding() receives a
        # concrete identifier even when the caller passed None.
        if fileid is None and len(self._fileids) == 1:
            fileid = self._fileids[0]
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        out = []
        # Element.iter() replaces getiterator(), which was deprecated and
        # removed in Python 3.9; the traversal order is identical.
        for node in elt.iter():
            # Tokenize both the element's text and its tail (text that
            # follows the element's closing tag) -- the tail handling is
            # this class's extension over the stock reader.
            for text in (node.text, node.tail):
                if text is None:
                    continue
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                out.extend(word_tokenizer.tokenize(text))
        return out

    def raw(self, fileids=None):
        """
        Return the concatenated raw contents of the given files.

        :param fileids: a file identifier, a list of identifiers, or None
            for every file in the corpus.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])
# Lemmatizer for Ancient Greek (cltk ships per-language lemma tables).
from cltk.stem.lemma import LemmaReplacer
lemmatizer = LemmaReplacer('greek')
# Read a single XML document (Marcus Aurelius, Meditations, book 1)
# from the local "texts/" directory.
cr = MyXMLCorpusReader("texts/", "Meditations_book1.xml")
# Retrieve all words, removing punctuation
# NOTE(review): '1' is treated as punctuation here -- presumably to drop
# section numbers embedded in the text; confirm against the source XML.
punc = [',', '.', ';', ':', '1']
def remove_punc(s):
    """Return *s* with every character listed in ``punc`` removed, stripped of
    surrounding whitespace."""
    kept = (ch for ch in s if ch not in punc)
    return ''.join(kept).strip()
from collections import Counter

# Tokens that are pure punctuation are dropped; embedded punctuation
# characters are stripped from the remaining tokens.
words = [remove_punc(w) for w in cr.words() if w not in punc]
# Lemmatize.  BUG FIX: the original comprehension collected the surface
# word ``w`` instead of the lemma ``wl``, so the "lemmas" list actually
# contained the unlemmatized words.
lemmas = [wl for w in words for wl in lemmatizer.lemmatize(w)]
# BUG FIX: the original printed len(lemmas) (total tokens) while claiming
# a count of *unique* lemmas.
print("{0} unique lemmas.".format(len(set(lemmas))))
# Frequency count, most frequent first.  Counter.most_common() yields the
# same (lemma, count) pairs, sorted by descending count, as the original
# hand-rolled dict + sorted().
counts = Counter(lemmas).most_common()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment