Skip to content

Instantly share code, notes, and snippets.

@fenimore
Last active August 29, 2015 14:19
Show Gist options
  • Save fenimore/8308657eaaf5c8817726 to your computer and use it in GitHub Desktop.
Save fenimore/8308657eaaf5c8817726 to your computer and use it in GitHub Desktop.
DREaM Sentiment Analysis
import os
import nltk
import glob
import matplotlib.pyplot as plt
import re
from os import listdir
from bs4 import BeautifulSoup
from textblob import TextBlob
# This function strips the XML tags from a file.
# BeautifulSoup parses the XML.
def strip_tags(textFile):
    """Return the text of a marked-up file with all tags removed.

    Args:
        textFile: Path of the file to read.

    Returns:
        The document text produced by BeautifulSoup's ``get_text()``.
    """
    # Open the file in a ``with`` block so the handle is closed as soon as
    # BeautifulSoup has consumed it, instead of leaking one open file per
    # document processed.
    with open(textFile) as handle:
        soup = BeautifulSoup(handle, "lxml")
    return soup.get_text()
# Collect the name of every .xml file in the current working directory.
# textFiles is a list of file-name strings, one entry per document.
# glob.glob() returns matches in arbitrary, OS-dependent order, so the list is
# sorted to keep positional indexes (e.g. textFiles[136]) stable between runs.
textFiles = sorted(glob.glob("*.xml"))
#RUN:
#for textFile in textFiles:
#output = [perform operation on THIS textFile]
#print output
# Count adjacent word pairs (bigrams) in textWords and print the three most
# frequent ones, one per line, as "<count> <word1> <word2>".
bigraming = list(nltk.ngrams(textWords, 2))
gramFreq = nltk.FreqDist(bigraming)
for words, count in gramFreq.most_common(3):
    # Each bigram is already a sequence of strings, so it can be joined
    # directly. Building a single string keeps the output identical whether
    # print is the Python 3 function or the Python 2 statement (which would
    # otherwise display a tuple).
    print("%d %s" % (count, " ".join(words)))
# IPython/Jupyter magic (not plain Python syntax): show matplotlib figures
# inline in the notebook output. This line only works inside an IPython session.
%matplotlib inline
# Draw a dispersion plot for the word "god" over the nltk Text held in
# textText (textText is built from textWords in another cell of this script).
textText.dispersion_plot(["god"])
# Recompute the bigram (adjacent word pair) frequencies for the current
# textWords and print the three most frequent pairs as "<count> <word1> <word2>".
bigraming = list(nltk.ngrams(textWords, 2))
gramFreq = nltk.FreqDist(bigraming)
for words, count in gramFreq.most_common(3):
    # Join the bigram directly (no intermediate list needed) and emit one
    # pre-formatted string so the output is the same under Python 2 and 3.
    print("%d %s" % (count, " ".join(words)))
# Pre-process one document into the objects used by the analysis cells.
# To process a single chosen file, pass "textFiles[x]" instead of textFile.
cleanText = strip_tags(textFile)
# Gather the stripped text into the shared corpus (this takes a while).
# NOTE(review): corpus is not defined in the visible code -- confirm it is
# created before this cell runs.
corpus.append(cleanText)
# Lower-case the text, then split it into tokens.
textTokens = nltk.word_tokenize(cleanText.lower())
# Keep only the tokens that contain at least one alphabetic character.
textWords = [token for token in textTokens if any(ch.isalpha() for ch in token)]
# Wrap the word list in an nltk Text object.
textText = nltk.Text(textWords)
# For every document: strip the markup, tokenize, keep the word tokens, print
# the collocations nltk reports (called with the argument 5), then print the
# document's position in textFiles followed by its file name.
# enumerate() supplies the running index, replacing a hand-maintained counter.
for i, textFile in enumerate(textFiles):
    cleanText = strip_tags(textFile)
    textTokens = nltk.word_tokenize(cleanText.lower())  # tokenize
    # Keep only tokens containing at least one alphabetic character.
    textWords = [word for word in textTokens if any(c.isalpha() for c in word)]
    textText = nltk.Text(textWords)  # get nltk Text
    textText.collocations(5)
    # One pre-formatted string gives "<index> <file>" under both the
    # Python 2 print statement and the Python 3 print function.
    print("%d %s" % (i, textFile))
# Print the TextBlob sentiment polarity of one document.
# The index 136 is hard-coded; textFiles must hold at least 137 entries or
# this raises IndexError.
cleanText = strip_tags(textFiles[136])
blob = TextBlob(cleanText)
# Parenthesised single-argument print behaves identically on Python 2 and 3.
print(blob.sentiment.polarity)
# Print the TextBlob sentiment polarity of every document, one line each, in
# the form "<index> :  <polarity>  in  <file>".
# enumerate() supplies the running index, replacing a hand-maintained counter.
for i, textFile in enumerate(textFiles):
    cleanText = strip_tags(textFile)
    blob = TextBlob(cleanText)
    # The doubled spaces reproduce exactly what the original comma-separated
    # Python 2 print statement emitted; a single formatted string makes the
    # line come out the same on Python 3 as well.
    print("%d :  %s  in  %s" % (i, blob.sentiment.polarity, textFile))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment