Last active
August 29, 2015 14:19
-
-
Save fenimore/8308657eaaf5c8817726 to your computer and use it in GitHub Desktop.
DREaM Sentiment Analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import nltk | |
import glob | |
import matplotlib.pyplot as plt | |
import re | |
from os import listdir | |
from bs4 import BeautifulSoup | |
from textblob import TextBlob | |
# Strip XML markup from a file and return only its text content.
# BeautifulSoup (with the lxml parser) does the actual parsing.
def strip_tags(textFile):
    """Return the tag-free plain text of *textFile* (path to an XML file)."""
    # Open via a context manager so the file handle is closed promptly;
    # the original passed open() directly and leaked the handle.
    with open(textFile) as handle:
        soup = BeautifulSoup(handle, "lxml")
    return soup.get_text()
# Gather the corpus: one path per XML document in the current directory.
# textFiles is a plain list, e.g. ["a.xml", "b.xml", ...] — one entry per document.
textFiles = glob.glob("*.xml")
# Typical usage pattern:
#   for textFile in textFiles:
#       output = ...  # operate on this single document
#       print(output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bigraming = list(nltk.ngrams(textWords, 2)) | |
gramFreq = nltk.FreqDist(bigraming) | |
for words, count in gramFreq.most_common(3): | |
print(count, " ".join(list(words))) | |
%matplotlib inline | |
textText.dispersion_plot(["god"]) | |
bigraming = list(nltk.ngrams(textWords, 2)) | |
gramFreq = nltk.FreqDist(bigraming) | |
for words, count in gramFreq.most_common(3): | |
print(count, " ".join(list(words))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clean one document and build the NLTK structures from it.
# NOTE(review): textFile and corpus come from surrounding cells.
cleanText = strip_tags(textFile)  # or a single file: textFiles[x]
corpus.append(cleanText)          # accumulate into one variable — this takes a while
textTokens = nltk.word_tokenize(cleanText.lower())  # lowercase, then tokenize
# Keep only tokens that contain at least one alphabetic character.
textWords = [w for w in textTokens if any(c.isalpha() for c in w)]
textText = nltk.Text(textWords)   # wrap as an nltk Text for analysis methods
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print the top 5 collocations for every document, tagged with its index.
# Fixes the Python 2 `print i, textFile` statement (a SyntaxError under
# Python 3, and inconsistent with the print() calls used elsewhere) and
# replaces the manual counter with enumerate.
for i, textFile in enumerate(textFiles):
    cleanText = strip_tags(textFile)                     # remove XML markup
    textTokens = nltk.word_tokenize(cleanText.lower())   # tokenize
    # Keep only tokens with at least one alphabetic character.
    textWords = [word for word in textTokens if any(c.isalpha() for c in word)]
    textText = nltk.Text(textWords)
    textText.collocations(5)                             # prints 5 collocations
    print(i, textFile)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sentiment polarity of a single document (index 136).
# TextBlob's polarity is a float in [-1.0, 1.0].
# Fixes the Python 2 `print` statement (SyntaxError under Python 3).
cleanText = strip_tags(textFiles[136])
blob = TextBlob(cleanText)
print(blob.sentiment.polarity)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Report the sentiment polarity of every document in the corpus.
# Fixes the Python 2 `print` statement (SyntaxError under Python 3) and
# replaces the manual counter with enumerate.
for i, textFile in enumerate(textFiles):
    cleanText = strip_tags(textFile)  # strip XML markup first
    blob = TextBlob(cleanText)
    # Polarity is a float in [-1.0, 1.0].
    print(i, ": ", blob.sentiment.polarity, " in ", textFile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment