@pcasaretto
Created September 27, 2015 19:17
#!/usr/bin/python
from nltk.stem.snowball import SnowballStemmer
import string
import re
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated)

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        """

    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### project part 2: comment out the line below
        ### (left in from the project scaffold; it is overwritten by the stemmed result below)
        words = text_string

        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        stemmer = SnowballStemmer("english")
        ar_words = map(lambda x: stemmer.stem(x.strip()), text_string.split(' '))
        ar_words = filter(lambda x: x != '', ar_words)
        words = " ".join(ar_words)
        words = re.sub(r'\s+', ' ', words)

    return words
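
### Portability note (not part of the original gist): this script targets Python 2.
### string.maketrans and the two-argument str.translate used above do not exist in
### Python 3, where the equivalent punctuation-stripping step would be roughly
###     text_string = content[1].translate(str.maketrans("", "", string.punctuation))
### and the print statement in main() would become print(text).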
def main():
    ff = open("../text_learning/test_email.txt", "r")
    text = parseOutText(ff)
    print text

if __name__ == '__main__':
    main()
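
### Minimal usage sketch (the sample text and filename below are made up; assumes
### Python 2 with NLTK installed). parseOutText only calls seek() and read(), so
### any file-like object works, e.g.:
###
###     from StringIO import StringIO
###     sample = StringIO("Message-ID: <123>\n"
###                       "X-FileName: example.pst\n\n"
###                       "Hi everyone, the meetings were rescheduled to Tuesday.")
###     print parseOutText(sample)
###
### This should print something like
###     examplepst hi everyon the meet were reschedul to tuesday
### since everything after the "X-FileName:" header (including the remainder of
### that header line) is treated as body text, stripped of punctuation, and stemmed.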