Skip to content

Instantly share code, notes, and snippets.

@waldofe
Created September 13, 2012 18:44
Show Gist options
  • Save waldofe/3716604 to your computer and use it in GitHub Desktop.
Save waldofe/3716604 to your computer and use it in GitHub Desktop.
A tool to parse eis pattern content from XML documents.
from elementtree import ElementTree as ET
from os.path import join, basename, dirname
from nltk.util import clean_html
from nltk.corpus.reader import PlaintextCorpusReader
import re
class EisParser(object):
    """Produce legible text from a single document file.

    The file is read through NLTK's ``PlaintextCorpusReader``, passed through
    ``nltk.util.clean_html`` and then every ``<word> <timestamp> `` marker
    (timestamp formatted as ``YYYY-MM-DDTHH:MM:SS``) is removed.
    """

    # A run of word characters, a space, a ``YYYY-MM-DDTHH:MM:SS`` timestamp
    # and a trailing space. Compiled once so repeated calls reuse it.
    _TIMESTAMP_MARKER = re.compile(r'\w+ \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2} ')

    def __init__(self, path):
        """Remember which file to parse.

        :param path: path of the document. It is split into its directory
            part (used as the corpus root) and its file name (used as the
            corpus file id).
        """
        self._name = basename(path)
        self._path = dirname(path)

    def legible_text(self):
        """Return the document text with markup and timestamp markers removed.

        The result is also stored on ``self.content``; the intermediate raw
        and cleaned texts are kept on ``self._raw_content_text`` and
        ``self._cleaned_text``.

        :returns: the cleaned text.
        """
        # The file id is passed as a one-element list on purpose: a bare
        # string is interpreted by the corpus reader as a regular expression,
        # so a name such as "doc.xml" could match sibling files and a name
        # containing regex metacharacters might not match itself.
        reader = PlaintextCorpusReader(self._path, [self._name])
        self._raw_content_text = reader.raw()

        # NOTE(review): recent NLTK releases appear to ship ``clean_html`` as
        # a stub that raises NotImplementedError -- confirm against the
        # installed NLTK version.
        self._cleaned_text = clean_html(self._raw_content_text)

        self.content = self._TIMESTAMP_MARKER.sub('', self._cleaned_text)
        return self.content
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment