Created
December 31, 2011 18:47
-
-
Save mallamanis/1544921 to your computer and use it in GitHub Desktop.
TextTiling implementation with NLTK
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import nltk | |
from nltk.stem.porter import PorterStemmer | |
def preprocessText(text):
    """Lower-case *text*, tokenize it and drop punctuation tokens.

    The lower-cased text is split with ``nltk.wordpunct_tokenize``; every
    token that is exactly one of a fixed list of punctuation strings is
    discarded.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    punctuation = frozenset([
        ',', '.', '-', '(', ')', '"', "'", ').', '),', '/', ';', ':',
    ])
    tokens = nltk.wordpunct_tokenize(text.lower())
    return [token for token in tokens if token not in punctuation]
def preprocessPseudosentences(pseudosentences):
    """Run ``preprocessPseudosentence`` over every pseudosentence.

    Returns:
        A new list with the processed pseudosentences, in input order.
    """
    return [preprocessPseudosentence(tokens) for tokens in pseudosentences]
def preprocessPseudosentence(tokenizedText):
    """Remove English stopwords from a token list and lemmatize the rest.

    Args:
        tokenizedText: iterable of word tokens.

    Returns:
        A list holding the WordNet lemma of every token that is not in
        NLTK's English stopword list, in the original order.
    """
    from nltk.corpus import stopwords
    from nltk.stem.wordnet import WordNetLemmatizer

    # Build the stopword collection and the lemmatizer once per call; the
    # previous code re-evaluated stopwords.words('english') for every token
    # and constructed a new WordNetLemmatizer for every word.
    english_stopwords = frozenset(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    return [lemmatize(word) for word in tokenizedText
            if word not in english_stopwords]
def createPseudosentences(text, width=20):
    """Split a sequence into consecutive chunks of at most *width* items.

    Args:
        text: a sliceable sequence (the script passes a list of tokens).
        width: number of items per pseudosentence; must be positive.

    Returns:
        A list of slices of *text*. The last slice may be shorter than
        *width*; an empty *text* yields an empty list.

    Raises:
        ValueError: if *width* is zero or negative. Such a width previously
            made the chunking loop run forever.
    """
    if width <= 0:
        raise ValueError("width must be a positive integer")
    return [text[start:start + width]
            for start in range(0, len(text), width)]
def blockScores(pseudosentences, printOutput = False):
    """Score the lexical similarity between adjacent two-pseudosentence blocks.

    For each window position ``start`` the block formed by pseudosentences
    ``start`` and ``start + 1`` is compared with the block formed by
    ``start + 2`` and ``start + 3``. The score is the cosine similarity of
    the two blocks' raw term-count vectors.

    Args:
        pseudosentences: list of token lists.
        printOutput: when true, print both blocks, every shared term with
            its counts in each block, and the resulting score.

    Returns:
        A list of floats, one per window. A window in which either block
        has no tokens scores 0.0 (this used to raise ZeroDivisionError).
    """
    import math
    from collections import Counter

    scores = []
    # NOTE(review): the bound stops one window before the last one that
    # would fit, so the final pseudosentence is never used. Presumably this
    # is because it may be shorter than the others -- confirm. The bound is
    # kept exactly as in the original.
    for start in range(len(pseudosentences) - 4):
        block1 = list(pseudosentences[start]) + list(pseudosentences[start + 1])
        block2 = list(pseudosentences[start + 2]) + list(pseudosentences[start + 3])
        if printOutput:
            print(block1)
            print(block2)

        # Block similarity metric: count each term once per block instead
        # of rescanning both blocks for every distinct term.
        counts1 = Counter(block1)
        counts2 = Counter(block2)

        similarity = 0
        for term, wtb1 in counts1.items():
            wtb2 = counts2.get(term, 0)
            if wtb2 > 0:
                similarity += wtb1 * wtb2
                if printOutput:
                    print('Common term "' + term + '" wtb1 =' + str(wtb1) + ' wtb2=' + str(wtb2))

        w1 = sum(count * count for count in counts1.values())
        w2 = sum(count * count for count in counts2.values())

        norm = math.sqrt(float(w1 * w2))
        # An empty block gives a zero norm; there is nothing in common, so
        # report no similarity rather than dividing by zero.
        score = similarity / norm if norm else 0.0
        scores.append(score)
        if printOutput:
            print("Score:" + str(score))
    return scores
def vocabularyIntrodctions(pseudosentences, width = 20):
    """
    Calculate vocabulary introduction scores.

    First the set of terms each pseudosentence introduces (terms not seen
    in any earlier pseudosentence) is collected and printed. Then, for each
    pair of consecutive pseudosentences, the score is the number of new
    terms in the pair divided by ``2 * width``.

    Args:
        pseudosentences: list of token lists (tokens must be hashable).
        width: nominal number of tokens per pseudosentence, used only as
            the normalising constant.

    Returns:
        A list of float scores, one per window. The new-term counts and
        each score are also printed.
    """
    # The builtin set replaces sets.Set, which was deprecated in Python 2.6
    # and removed in Python 3.
    seenWords = set()
    pseudosentence_new_terms = []
    for sentence in pseudosentences:
        newWords = set(sentence) - seenWords
        seenWords |= newWords
        print(newWords)
        pseudosentence_new_terms.append(newWords)

    scores = []
    # NOTE(review): the bound leaves out the pair that contains the final
    # pseudosentence. Presumably this is because it may hold fewer than
    # `width` tokens -- confirm. The bound is kept exactly as in the original.
    for start in range(len(pseudosentences) - 2):
        block1newTerms = len(pseudosentence_new_terms[start])
        block2newTerms = len(pseudosentence_new_terms[start + 1])
        print("new terms in b1=" + str(block1newTerms) + " in b2=" + str(block2newTerms))
        score = (block1newTerms + block2newTerms) / (2. * width)
        print("Score: " + str(score))
        scores.append(score)
    return scores
def calculateDepth(scores):
    """Compute second-difference values over a list of scores.

    For each index ``i`` the value is
    ``scores[i] - 2 * scores[i + 1] + scores[i + 2]``. The mean of the
    values is printed when at least one value exists.

    Args:
        scores: list of numeric scores.

    Returns:
        The list of computed values. It is empty when fewer than four
        scores are given; that case used to raise ZeroDivisionError while
        computing the mean.
    """
    # NOTE(review): the bound stops one triple before the last one that
    # would fit; it is kept as in the original -- confirm it is intended.
    depthScores = [
        scores[i] - 2 * scores[i + 1] + scores[i + 2]
        for i in range(len(scores) - 3)
    ]
    if depthScores:
        # float() keeps the mean a true division on Python 2 as well.
        mean = sum(depthScores) / float(len(depthScores))
        print('Mean:' + str(mean))
    return depthScores
# Retrieve text
# The context manager closes the file even if reading raises.
with open("asteroid.txt", "r") as f:
    text = f.read()
preprocessedText = preprocessText(text)
# Pseudosentences
pseudosentences = createPseudosentences(preprocessedText)
pseudosentences = preprocessPseudosentences(pseudosentences)
scores = blockScores(pseudosentences, printOutput = True)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(scores)
# Depth scores
depthScores = calculateDepth(scores)
print(depthScores)
#print vocabularyIntrodctions(pseudosentences)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment