Skip to content

Instantly share code, notes, and snippets.

@sheikholeslami
Last active August 29, 2015 14:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sheikholeslami/f5dc9c2f0fa06fca0528 to your computer and use it in GitHub Desktop.
Save sheikholeslami/f5dc9c2f0fa06fca0528 to your computer and use it in GitHub Desktop.
به نام خدا
تمام فایل های لازم جهت اجرای کد در لینک های ارائه شده در فاز های قبل قابل مشاهده است.
#key phrase
from __future__ import unicode_literals
# Text-processing helpers from the hazm package, instantiated once at start-up.
# NOTE(review): in the visible script `stemmer` and `lemmatizer` appear only in
# commented-out lines and `normalizer` is never referenced — confirm whether
# they are still needed before removing them.
from hazm import Normalizer
normalizer = Normalizer()
from hazm import Stemmer , Lemmatizer
stemmer = Stemmer()
lemmatizer = Lemmatizer()
import io


def _read_entries(path):
    """Return the non-empty lines of the UTF-8 text file at `path`.

    Each line is stripped of surrounding whitespace (including any "\\n" or
    "\\r\\n" terminator).  The previous code removed a fixed number of trailing
    characters instead, which silently cut the last letter(s) off an entry
    whenever the file used a different line ending or its last line had no
    terminator.  The file is closed as soon as it has been read.
    """
    with io.open(path, encoding="utf8") as handle:
        stripped = (raw.strip() for raw in handle)
        # Blank lines can never match a token, so they are dropped.
        return [entry for entry in stripped if entry]


# Words that must not be counted as terms.
slist = _read_entries("stopword.txt")
# Punctuation / symbol tokens that must not be counted as terms.
symbols = _read_entries("Symbols.txt")
import codecs

# --- State for the corpus pass performed by the loop that follows ----------

# Parallel lists, always appended to together: the term itself, the number of
# documents it was counted in, and a flag marking "already counted for the
# document currently being processed".
mylist, tedad, chek = [], [], []

# Per-document scratch lists (filtered tokens and their bigrams).
faz, faz2 = [], []

# Initialised here but not read anywhere in the visible script.
positionlist, ld, my = [], [], []

bodycounter = 0    # number of documents completed so far
checker = False    # True while inside a <TEXT> body that spans several lines
text = str()       # accumulates the body of the document being read

corpus = open("b.txt", "r")
# ---------------------------------------------------------------------------
# Corpus pass: for every <TEXT>...</TEXT> document read from `corpus`, count in
# how many documents each unigram and bigram occurs.  The results live in the
# parallel lists `mylist` (term), `tedad` (document frequency) and `chek`
# (1 once the term has been counted for the document being processed).
# `bodycounter` ends up holding the number of documents.
# ---------------------------------------------------------------------------

# Hashed copies so the per-token filters are O(1) instead of list scans.
_stop_terms = set(slist)
_symbol_terms = set(symbols)
# term -> index into mylist/tedad/chek; replaces a linear search per token.
_term_position = dict((known, pos) for pos, known in enumerate(mylist))


def _count_once_per_document(term):
    """Raise the document frequency of `term` at most once per document."""
    pos = _term_position.get(term)
    if pos is None:
        # First sighting anywhere in the corpus.
        _term_position[term] = len(mylist)
        mylist.append(term)
        tedad.append(1)
        chek.append(1)
    elif chek[pos] == 0:
        # Known term, first sighting inside the current document.
        tedad[pos] += 1
        chek[pos] = 1


for line in corpus:
    line = line.decode("utf8")
    first = line.find("<TEXT>")
    last = line.find("</TEXT>")

    if last != -1:
        # A closing tag ends the current document.
        checker = False
        if first != -1 and first < last:
            # Opening and closing tag on one line: keep what lies between.
            text += line[first + 6:last]
        else:
            # The body began on an earlier line (or a new <TEXT> follows the
            # closing tag on this line): keep everything before the tag.
            text += line[:last]

        wordslist = text.split()

        # New document: nothing has been counted for it yet.
        chek[:] = [0] * len(chek)

        # faz: tokens that are neither symbols nor stop words, in order.
        faz = [token for token in wordslist
               if token not in _symbol_terms and token not in _stop_terms]
        # faz2: every pair of adjacent surviving tokens (the bigrams).
        faz2 = [left + " " + right for left, right in zip(faz, faz[1:])]

        for term in faz:
            _count_once_per_document(term)
        for term in faz2:
            _count_once_per_document(term)

        # Up to here one document has been folded into the statistics: for
        # every word and bigram we know in how many documents it occurs.
        text = ""
        bodycounter += 1

    if checker:
        # Line lies inside a multi-line body.
        text += line
    if first != -1 and first > last:
        # An opening tag without a later closing tag on this line starts a
        # body that continues on the following lines.
        text += line[first + 6:]
        checker = True

# The corpus has been consumed completely; release the file handle.
corpus.close()
##**************************
############
"""for i in range (len( mylist )):
print mylist[i]
print tedad[i]
print chek[i]"""
#########
import io

# --- Score the words of one input document against the corpus statistics ---
n = bodycounter  # total number of documents in the corpus

# Filled in by the following steps of the script:
newterm = []   # distinct terms of the input document
flist = []     # occurrences of each term inside the input document
nlist = []     # number of corpus documents containing each term
newlist = []   # every candidate term (unigrams and bigrams), with repeats

# Read the whole document in one go; the context manager closes the file.
with io.open("test2.txt", encoding="utf8") as newcorpus:
    newstring = newcorpus.read()
newlist1 = newstring.split()
# --- Candidate terms of the new document -----------------------------------
too = len(newlist1)

# fazz: all words of this document that are neither symbols nor stop words,
# in their original order.
fazz = [token for token in newlist1
        if not (token in symbols or token in slist)]

# fazz2: bigrams formed from each pair of adjacent surviving words.
fazz2 = [left + " " + right for left, right in zip(fazz, fazz[1:])]

# newlist receives the surviving unigrams first, then every bigram that is
# itself listed neither as a stop word nor as a symbol.
newlist.extend(fazz)
newlist.extend(pair for pair in fazz2
               if not (pair in slist or pair in symbols))
# --- Term frequency inside the new document --------------------------------
# newterm / flist / nlist stay parallel lists in first-occurrence order; the
# dict only replaces the linear search that was done for every candidate.
_newterm_position = dict((known, pos) for pos, known in enumerate(newterm))
for term in newlist:
    pos = _newterm_position.get(term)
    if pos is None:
        _newterm_position[term] = len(newterm)
        newterm.append(term)
        flist.append(1)
        nlist.append(0)
    else:
        flist[pos] += 1

# --- Document frequency taken from the corpus pass -------------------------
# A term the corpus never contained keeps a document frequency of 0.
_corpus_doc_freq = dict(zip(mylist, tedad))
for j, term in enumerate(newterm):
    nlist[j] = _corpus_doc_freq.get(term, 0)
# For the input text, the term frequency (flist) and the number of corpus
# documents containing each term (nlist) are known at this point.
# ___________________ score definition ____________________
import math

# Guards for degenerate input: an empty document used to crash in max(), and
# an empty corpus (n == 0) used to crash in math.log().  With no corpus every
# idf becomes log10(1) == 0, i.e. all scores are 0.
maxf = max(flist) if flist else 1
_corpus_size = max(n, 1)

scorelist = []
for freq, doc_freq in zip(flist, nlist):
    # Augmented term frequency, scaled into the range (0.5, 1.0].
    tf = 0.5 + (0.5 * freq) / (maxf * 1.0)
    # A term unseen in the corpus is treated like one found in one document.
    idf = math.log(_corpus_size * 1.0 / max(doc_freq, 1), 10)
    scorelist.append(tf * idf)
# --- Rank the terms and print the best ones --------------------------------
# (score, term) tuples in ascending order; ties are ordered by the term.
sortlist = sorted(zip(scorelist, newterm))
ll = len(sortlist)

# Each print() gets exactly one argument so the lines behave the same whether
# print is a statement or a function.
# NOTE(review): the banner says "top ten" but the script has always printed up
# to twenty terms — confirm which of the two is intended.
print("**_____________*top ten *____________**")
print("")
# Slicing caps the output at the number of available terms; the previous
# index arithmetic walked into negative indices (repeating entries and then
# raising IndexError) when fewer than twenty terms had been scored.
for score, term in reversed(sortlist[-20:]):
    print("term is : %s  score: %s" % (term, score))
"""scoref=open ("score.txt" , "w")
for i in range(len(scorelist)):
scoref.write(newterm[i])
scoref.write(":")
scoref.write(str(scorelist[i]))
scoref.write("\n")
scoref.close()"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment