Last active
August 29, 2015 14:19
-
-
Save fenimore/8308657eaaf5c8817726 to your computer and use it in GitHub Desktop.
DREaM Sentiment Analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import nltk | |
import glob | |
import matplotlib.pyplot as plt | |
import re | |
from os import listdir | |
from bs4 import BeautifulSoup | |
from textblob import TextBlob | |
# Strip XML markup from a file and return only its text content.
# BeautifulSoup (with the lxml parser) does the actual parsing.
def strip_tags(textFile):
    """Return the tag-free plain text of *textFile* (path to an XML file)."""
    # Open via a context manager so the file handle is closed promptly;
    # the original passed open() directly and leaked the handle.
    with open(textFile) as handle:
        soup = BeautifulSoup(handle, "lxml")
    return soup.get_text()
# Gather the corpus: one path per XML document in the current directory.
# textFiles is a plain list, e.g. ["a.xml", "b.xml", ...] — one entry per document.
textFiles = glob.glob("*.xml")
# Typical usage pattern:
#   for textFile in textFiles:
#       output = ...  # operate on this single document
#       print(output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bigraming = list(nltk.ngrams(textWords, 2)) | |
gramFreq = nltk.FreqDist(bigraming) | |
for words, count in gramFreq.most_common(3): | |
print(count, " ".join(list(words))) | |
%matplotlib inline | |
textText.dispersion_plot(["god"]) | |
bigraming = list(nltk.ngrams(textWords, 2)) | |
gramFreq = nltk.FreqDist(bigraming) | |
for words, count in gramFreq.most_common(3): | |
print(count, " ".join(list(words))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clean one document and build the NLTK structures from it.
# NOTE(review): textFile and corpus come from surrounding cells.
cleanText = strip_tags(textFile)  # or a single file: textFiles[x]
corpus.append(cleanText)          # accumulate into one variable — this takes a while
textTokens = nltk.word_tokenize(cleanText.lower())  # lowercase, then tokenize
# Keep only tokens that contain at least one alphabetic character.
textWords = [w for w in textTokens if any(c.isalpha() for c in w)]
textText = nltk.Text(textWords)   # wrap as an nltk Text for analysis methods
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print the top 5 collocations for every document, tagged with its index.
# Fixes the Python 2 `print i, textFile` statement (a SyntaxError under
# Python 3, and inconsistent with the print() calls used elsewhere) and
# replaces the manual counter with enumerate.
for i, textFile in enumerate(textFiles):
    cleanText = strip_tags(textFile)                     # remove XML markup
    textTokens = nltk.word_tokenize(cleanText.lower())   # tokenize
    # Keep only tokens with at least one alphabetic character.
    textWords = [word for word in textTokens if any(c.isalpha() for c in word)]
    textText = nltk.Text(textWords)
    textText.collocations(5)                             # prints 5 collocations
    print(i, textFile)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sentiment polarity of a single document (index 136).
# TextBlob's polarity is a float in [-1.0, 1.0].
# Fixes the Python 2 `print` statement (SyntaxError under Python 3).
cleanText = strip_tags(textFiles[136])
blob = TextBlob(cleanText)
print(blob.sentiment.polarity)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Report the sentiment polarity of every document in the corpus.
# Fixes the Python 2 `print` statement (SyntaxError under Python 3) and
# replaces the manual counter with enumerate.
for i, textFile in enumerate(textFiles):
    cleanText = strip_tags(textFile)  # strip XML markup first
    blob = TextBlob(cleanText)
    # Polarity is a float in [-1.0, 1.0].
    print(i, ": ", blob.sentiment.polarity, " in ", textFile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment