Last active
August 29, 2015 14:02
-
-
Save sheikholeslami/f5dc9c2f0fa06fca0528 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
به نام خدا
تمام فایل های لازم جهت اجرای کد در لینک های ارائه شده در فاز های قبل قابل مشاهده است.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Key-phrase extraction script: hazm imports and shared tool instances.
from __future__ import unicode_literals

from hazm import Lemmatizer, Normalizer, Stemmer

# One instance of each hazm tool, created once when the script starts.
normalizer = Normalizer()
stemmer = Stemmer()
lemmatizer = Lemmatizer()
# ---- Load the stop-word and symbol lists ----
import io


def _read_entries(path):
    """Return every line of the UTF-8 text file *path* as a list of strings.

    Only the line terminator is removed from each line, so the result is the
    same for LF and CRLF files and for a final line without a newline.
    """
    with io.open(path, encoding="utf8") as handle:
        return [raw.rstrip("\r\n") for raw in handle]


# Entries of stopword.txt, one per line.
slist = _read_entries("stopword.txt")
# Entries of Symbols.txt, one per line.
symbols = _read_entries("Symbols.txt")
# ---- Document-frequency pass over the corpus file b.txt ----
# A document body is the text between a <TEXT> tag and the next </TEXT> tag.
# For every document, each distinct kept word and each bigram of adjacent
# kept words is counted once, so tedad[i] ends up as the number of documents
# that contain mylist[i].
import codecs
import io

bodycounter = 0  # number of </TEXT> tags seen, i.e. documents processed
text = ""        # body of the document currently being collected
checker = False  # True while inside a body opened on an earlier line
mylist = []      # distinct unigrams / bigrams in first-seen order
tedad = []       # tedad[i]: number of documents containing mylist[i]

# Hash-based lookups replace repeated linear scans of the lists.
_ignored = frozenset(symbols).union(slist)
_position = {}   # term -> index into mylist / tedad

with io.open("b.txt", encoding="utf8") as corpus:
    for line in corpus:
        first = line.find("<TEXT>")
        last = line.find("</TEXT>")

        if last != -1:
            # Closing tag found: complete the body and count its terms.
            checker = False
            if first != -1 and first < last:
                # Opening and closing tag on the same line.
                text += line[first + 6:last]
            else:
                # Body started on an earlier line (or nothing was open).
                text += line[:last]

            # Words that are neither stop words nor symbols.
            kept = [word for word in text.split() if word not in _ignored]
            # Bigrams built from adjacent kept words.
            pairs = [a + " " + b for a, b in zip(kept, kept[1:])]

            # Each term contributes at most once per document.
            seen = set()
            for term in kept + pairs:
                if term in seen:
                    continue
                seen.add(term)
                index = _position.get(term)
                if index is None:
                    _position[term] = len(mylist)
                    mylist.append(term)
                    tedad.append(1)
                else:
                    tedad[index] += 1

            text = ""
            bodycounter += 1

        if checker:
            # Line lies inside a body that was opened earlier.
            text += line
        if first != -1 and first > last:
            # Opening tag with no closing tag after it on this line:
            # start collecting a new body from just past the tag.
            text += line[first + 6:]
            checker = True
# ---- Term statistics for the document to be scored (test2.txt) ----
# Produces three parallel lists:
#   newterm[i] - a distinct unigram or bigram of the document
#   flist[i]   - number of occurrences of newterm[i] in the document
#   nlist[i]   - number of corpus documents containing newterm[i] (0 if none)
import io

n = bodycounter  # total number of corpus documents

with io.open("test2.txt", encoding="utf8") as newcorpus:
    newstring = newcorpus.read()

_ignored = frozenset(symbols).union(slist)

# Words of the document that are neither stop words nor symbols.
fazz = [word for word in newstring.split() if word not in _ignored]
# Bigrams of adjacent kept words; a bigram that is itself listed as a
# stop word or symbol is dropped.
fazz2 = [
    pair
    for pair in (a + " " + b for a, b in zip(fazz, fazz[1:]))
    if pair not in _ignored
]
newlist = fazz + fazz2

# Count occurrences while keeping first-seen order of the distinct terms.
newterm = []
flist = []
_slot = {}  # term -> index into newterm / flist
for term in newlist:
    if term in _slot:
        flist[_slot[term]] += 1
    else:
        _slot[term] = len(newterm)
        newterm.append(term)
        flist.append(1)

# Look up each term's corpus document frequency; unseen terms get 0.
_doc_freq = dict(zip(mylist, tedad))
nlist = [_doc_freq.get(term, 0) for term in newterm]
# ---- Score every term of the document and print the best ones ----
# score = tf * idf, where
#   tf  = 0.5 + 0.5 * f / max_f        (f: occurrences in the document)
#   idf = log10(n / document frequency) (log10(n) for terms not in the corpus)
import math

# Number of rows printed.
# NOTE(review): the header below says "top ten" but the script has always
# printed 20 rows; both are kept as they were - confirm which is intended.
top_n = 20

scorelist = []
# An empty document has no frequencies; avoid max() on an empty list.
maxf = max(flist) if flist else 1
for freq, doc_freq in zip(flist, nlist):
    tf = 0.5 + (0.5 * freq) / (maxf * 1.0)
    if doc_freq == 0:
        idf = math.log(n, 10)
    else:
        idf = math.log(n * 1.0 / doc_freq, 10)
    scorelist.append(tf * idf)

# Highest score first; ties are broken by the term itself, descending.
sortlist = sorted(zip(scorelist, newterm), reverse=True)

print("**_____________*top ten *____________**")
# Slicing never runs past the list, so documents with fewer than top_n
# terms print each term exactly once.
for score, term in sortlist[:top_n]:
    print("term is : %s  score: %s" % (term, score))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment