Created
September 27, 2015 19:17
-
-
Save pcasaretto/b418ec6dcdadafd2fe94 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from nltk.stem.snowball import SnowballStemmer | |
import string | |
import re | |
def parseOutText(f):
    """Given an opened email file f, parse out all text below the
    metadata block at the top, stem each word, and return a single
    string containing all stemmed words separated by single spaces.

    example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """
    f.seek(0)  ### go back to beginning of file in case it was already read
    all_text = f.read()

    ### the email body starts after the "X-FileName:" metadata header;
    ### maxsplit=1 keeps the whole remainder even if the marker repeats
    content = all_text.split("X-FileName:", 1)
    words = ""
    if len(content) > 1:
        ### remove punctuation — Python 3 replacement for the removed
        ### Python 2 string.maketrans / two-argument str.translate API
        text_string = content[1].translate(
            str.maketrans("", "", string.punctuation))

        ### split on ALL whitespace (not just ' ') so newline-separated
        ### words are stemmed individually; empty tokens never occur with
        ### no-arg split, and the join yields single-space separation,
        ### making the old re.sub whitespace collapse unnecessary
        stemmer = SnowballStemmer("english")
        stemmed = (stemmer.stem(token) for token in text_string.split())
        words = " ".join(token for token in stemmed if token)
    return words
def main():
    """Smoke test: parse the sample email and print the extracted words."""
    ### with-statement guarantees the file handle is closed (the original
    ### opened it and never closed it); print() is Python 3 compatible
    with open("../text_learning/test_email.txt", "r") as ff:
        print(parseOutText(ff))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment