sheikholeslami/readme.txt

## readme.txt
به نام خدا

تمام فایل‌های لازم جهت اجرای کد در لینک‌های ارائه‌شده در فازهای قبل قابل مشاهده است.

## sheikholeslami.py
#key phrase

from __future__ import unicode_literals
from hazm import Normalizer
normalizer = Normalizer()
from hazm import Stemmer , Lemmatizer
stemmer = Stemmer()
lemmatizer = Lemmatizer()

def _read_entries(path):
    """Return the lines of the UTF-8 file at *path* without line terminators.

    Args:
        path: Location of a text file holding one entry per line.

    Returns:
        A list of decoded strings, one per line, in file order.
    """
    # Binary mode plus an explicit decode keeps the result independent of the
    # platform's newline translation; the handle is closed when the block exits.
    with open(path, "rb") as handle:
        # Strip the terminator itself rather than a fixed number of characters,
        # so LF files, CRLF files and an unterminated last line all yield the
        # bare entry instead of a truncated word or one with a trailing "\r".
        return [raw.decode("utf8").rstrip("\r\n") for raw in handle]


# Stop words that are ignored when terms are collected.
slist = _read_entries("stopword.txt")

# Punctuation / symbol tokens that are ignored when terms are collected.
symbols = _read_entries("Symbols.txt")


import codecs

# Corpus pass: split b.txt into documents delimited by <TEXT> ... </TEXT> and
# record, for every unigram and bigram, how many documents contain it.
# The later scoring code in this file reads mylist, tedad and bodycounter.
mylist = []       # distinct terms (words and "w1 w2" pairs) in first-seen order
tedad = []        # tedad[i] = number of documents in which mylist[i] occurs
bodycounter = 0   # number of closed <TEXT> sections, i.e. documents
text = ""         # body of the document currently being collected
checker = False   # True while inside a <TEXT> section spanning several lines

# Hashed lookups replace the linear scans over the term list that were done
# for every token, which made the pass quadratic in the vocabulary size.
excluded = frozenset(symbols).union(slist)
term_slot = {}    # term -> its index in mylist / tedad

with open("b.txt", "rb") as corpus:
    for line in corpus:
        line = line.decode("utf8")
        first = line.find("<TEXT>")
        last = line.find("</TEXT>")

        if last != -1:
            # A closing tag ends the current document.
            checker = False
            if first != -1 and first < last:
                # Opening and closing tag on the same line, in that order.
                text += line[first + 6:last]
            else:
                # Only a closing tag, or a closing tag followed by a new
                # opening tag: keep what precedes the closing tag.
                text += line[:last]

            # Unigrams are the tokens that are neither stop words nor symbols;
            # bigrams join neighbouring unigrams (faz2 = all pairs of the text).
            words = [tok for tok in text.split() if tok not in excluded]
            pairs = [left + " " + right for left, right in zip(words, words[1:])]

            # Each term counts at most once per document (document frequency).
            seen_in_doc = set()
            for term in words + pairs:
                if term in seen_in_doc:
                    continue
                seen_in_doc.add(term)
                slot = term_slot.get(term)
                if slot is None:
                    term_slot[term] = len(mylist)
                    mylist.append(term)
                    tedad.append(1)
                else:
                    tedad[slot] += 1

            # Up to here one document has been folded into the per-term
            # document counts; start collecting the next one.
            text = ""
            bodycounter += 1

        if checker:
            # Line lies fully inside a multi-line <TEXT> section.
            text += line
        if first != -1 and first > last:
            # Opening tag without a closing tag after it on this line:
            # start collecting from just behind the tag.
            text += line[first + 6:]
            checker = True


############
"""for i in range (len( mylist )):
    print mylist[i]
    print tedad[i]
    print chek[i]"""

        #########

# Profile the document to be scored (test2.txt): list its distinct unigrams and
# bigrams, count how often each occurs in the document (flist), and look up in
# how many corpus documents each one occurs (nlist, taken from mylist / tedad).
n = bodycounter  # total number of corpus documents

# Read the whole file before tokenising. Running the counting once per input
# line over the text accumulated so far counts the tokens of earlier lines
# again for every later line, which inflates their frequencies.
with open("test2.txt", "rb") as newcorpus:
    newstring = newcorpus.read().decode("utf8")

excluded = frozenset(symbols).union(slist)

# fazz: every token of the document that is not a stop word or symbol.
fazz = [word for word in newstring.split() if word not in excluded]
# fazz2: bigrams built from neighbouring kept tokens.
fazz2 = [left + " " + right for left, right in zip(fazz, fazz[1:])]
# Unigrams first, then the bigrams that are not themselves listed as excluded.
newlist = fazz + [pair for pair in fazz2 if pair not in excluded]

newterm = []     # distinct terms of the document in first-seen order
flist = []       # flist[i] = occurrences of newterm[i] in the document
query_slot = {}  # term -> its index in newterm / flist
for term in newlist:
    slot = query_slot.get(term)
    if slot is None:
        query_slot[term] = len(newterm)
        newterm.append(term)
        flist.append(1)
    else:
        flist[slot] += 1

# nlist[i] = number of corpus documents containing newterm[i], 0 if the term
# never occurs in the corpus.
doc_freq = dict(zip(mylist, tedad))
nlist = [doc_freq.get(term, 0) for term in newterm]


    #___________________tarife score____________________

scorelist=[]
import math
maxf = max(flist)
for i in range(len(newterm)):
    tf = 0.5 +( (0.5 * flist[i])/(maxf * 1.0))
    if nlist[i] ==0 :
        idf =math.log(n,10)
    else:
        idf = math.log((n * 1.0 / nlist[i]),10)
    score = tf * idf
    scorelist.append(score)
sortlist=[]
for i in range(len(scorelist)):
    newtuple=(scorelist[i],newterm[i])
    sortlist.append(newtuple)

sortlist.sort()
ll=len(sortlist)
ss=ll - 21
print "**_____________*top ten *____________**"
print
for j in range(ll-1 , ss , -1):
    print "term is :", sortlist[j][1] ,"        score:" , sortlist[j][0]


"""scoref=open ("score.txt" , "w")
for i in range(len(scorelist)):
    scoref.write(newterm[i])
    scoref.write(":")
    scoref.write(str(scorelist[i]))
    scoref.write("\n")


scoref.close()"""
	به نام خدا

	تمام فایل‌های لازم جهت اجرای کد در لینک‌های ارائه‌شده در فازهای قبل قابل مشاهده است.
	#key phrase

	from __future__ import unicode_literals
	from hazm import Normalizer
	normalizer = Normalizer()
	from hazm import Stemmer , Lemmatizer
	stemmer = Stemmer()
	lemmatizer = Lemmatizer()

	def _read_entries(path):
	    """Return the lines of the UTF-8 file at *path* without line terminators.

	    Args:
	        path: Location of a text file holding one entry per line.

	    Returns:
	        A list of decoded strings, one per line, in file order.
	    """
	    # Binary mode plus an explicit decode keeps the result independent of the
	    # platform's newline translation; the handle is closed when the block exits.
	    with open(path, "rb") as handle:
	        # Strip the terminator itself rather than a fixed number of characters,
	        # so LF files, CRLF files and an unterminated last line all yield the
	        # bare entry instead of a truncated word or one with a trailing "\r".
	        return [raw.decode("utf8").rstrip("\r\n") for raw in handle]


	# Stop words that are ignored when terms are collected.
	slist = _read_entries("stopword.txt")

	# Punctuation / symbol tokens that are ignored when terms are collected.
	symbols = _read_entries("Symbols.txt")




	import codecs

	# Corpus pass: split b.txt into documents delimited by <TEXT> ... </TEXT> and
	# record, for every unigram and bigram, how many documents contain it.
	# The later scoring code in this file reads mylist, tedad and bodycounter.
	mylist = []       # distinct terms (words and "w1 w2" pairs) in first-seen order
	tedad = []        # tedad[i] = number of documents in which mylist[i] occurs
	bodycounter = 0   # number of closed <TEXT> sections, i.e. documents
	text = ""         # body of the document currently being collected
	checker = False   # True while inside a <TEXT> section spanning several lines

	# Hashed lookups replace the linear scans over the term list that were done
	# for every token, which made the pass quadratic in the vocabulary size.
	excluded = frozenset(symbols).union(slist)
	term_slot = {}    # term -> its index in mylist / tedad

	with open("b.txt", "rb") as corpus:
	    for line in corpus:
	        line = line.decode("utf8")
	        first = line.find("<TEXT>")
	        last = line.find("</TEXT>")

	        if last != -1:
	            # A closing tag ends the current document.
	            checker = False
	            if first != -1 and first < last:
	                # Opening and closing tag on the same line, in that order.
	                text += line[first + 6:last]
	            else:
	                # Only a closing tag, or a closing tag followed by a new
	                # opening tag: keep what precedes the closing tag.
	                text += line[:last]

	            # Unigrams are the tokens that are neither stop words nor symbols;
	            # bigrams join neighbouring unigrams (faz2 = all pairs of the text).
	            words = [tok for tok in text.split() if tok not in excluded]
	            pairs = [left + " " + right for left, right in zip(words, words[1:])]

	            # Each term counts at most once per document (document frequency).
	            seen_in_doc = set()
	            for term in words + pairs:
	                if term in seen_in_doc:
	                    continue
	                seen_in_doc.add(term)
	                slot = term_slot.get(term)
	                if slot is None:
	                    term_slot[term] = len(mylist)
	                    mylist.append(term)
	                    tedad.append(1)
	                else:
	                    tedad[slot] += 1

	            # Up to here one document has been folded into the per-term
	            # document counts; start collecting the next one.
	            text = ""
	            bodycounter += 1

	        if checker:
	            # Line lies fully inside a multi-line <TEXT> section.
	            text += line
	        if first != -1 and first > last:
	            # Opening tag without a closing tag after it on this line:
	            # start collecting from just behind the tag.
	            text += line[first + 6:]
	            checker = True
	##**************************


	############
	"""for i in range (len( mylist )):
	print mylist[i]
	print tedad[i]
	print chek[i]"""

	#########

	# Profile the document to be scored (test2.txt): list its distinct unigrams and
	# bigrams, count how often each occurs in the document (flist), and look up in
	# how many corpus documents each one occurs (nlist, taken from mylist / tedad).
	n = bodycounter  # total number of corpus documents

	# Read the whole file before tokenising. Running the counting once per input
	# line over the text accumulated so far counts the tokens of earlier lines
	# again for every later line, which inflates their frequencies.
	with open("test2.txt", "rb") as newcorpus:
	    newstring = newcorpus.read().decode("utf8")

	excluded = frozenset(symbols).union(slist)

	# fazz: every token of the document that is not a stop word or symbol.
	fazz = [word for word in newstring.split() if word not in excluded]
	# fazz2: bigrams built from neighbouring kept tokens.
	fazz2 = [left + " " + right for left, right in zip(fazz, fazz[1:])]
	# Unigrams first, then the bigrams that are not themselves listed as excluded.
	newlist = fazz + [pair for pair in fazz2 if pair not in excluded]

	newterm = []     # distinct terms of the document in first-seen order
	flist = []       # flist[i] = occurrences of newterm[i] in the document
	query_slot = {}  # term -> its index in newterm / flist
	for term in newlist:
	    slot = query_slot.get(term)
	    if slot is None:
	        query_slot[term] = len(newterm)
	        newterm.append(term)
	        flist.append(1)
	    else:
	        flist[slot] += 1

	# nlist[i] = number of corpus documents containing newterm[i], 0 if the term
	# never occurs in the corpus.
	doc_freq = dict(zip(mylist, tedad))
	nlist = [doc_freq.get(term, 0) for term in newterm]





	#___________________tarife score____________________

	scorelist=[]
	import math
	maxf = max(flist)
	for i in range(len(newterm)):
	tf = 0.5 +( (0.5 * flist[i])/(maxf * 1.0))
	if nlist[i] ==0 :
	idf =math.log(n,10)
	else:
	idf = math.log((n * 1.0 / nlist[i]),10)
	score = tf * idf
	scorelist.append(score)
	sortlist=[]
	for i in range(len(scorelist)):
	newtuple=(scorelist[i],newterm[i])
	sortlist.append(newtuple)

	sortlist.sort()
	ll=len(sortlist)
	ss=ll - 21
	print "*_____________top ten ____________*"
	print
	for j in range(ll-1 , ss , -1):
	print "term is :", sortlist[j][1] ," score:" , sortlist[j][0]



	"""scoref=open ("score.txt" , "w")
	for i in range(len(scorelist)):
	scoref.write(newterm[i])
	scoref.write(":")
	scoref.write(str(scorelist[i]))
	scoref.write("\n")



	scoref.close()"""