h-alg/regexp.py Secret

## regexp.py
import nltk
from nltk.tag import brill
from nltk.tag import crf
from nltk.tag.sequential import *
from nltk.corpus import udhr
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import brown
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag.brill import SymmetricProximateTokensTemplate, ProximateTokensTemplate
from nltk.tag.brill import ProximateTagsRule, ProximateWordsRule, FastBrillTaggerTrainer

# train_test is a list: the first item is the training corpus, the second item is the testing corpus
# this function returns train_test

def BuildCorpus(filename, split_index=5911):
    """Read a one-token-per-line tagged file and split it into train/test.

    Only lines with exactly three whitespace-separated fields are used. The
    middle field is discarded, so every token becomes a
    ``(first_field, last_field)`` tuple. Tokens are grouped into sentences:
    a token whose first field is ``'.'`` ends the current sentence and is
    not stored itself.

    Args:
        filename: Path of the corpus file to read.
        split_index: Number of leading sentences that go to the training
            part; the remaining sentences form the testing part. Defaults
            to 5911, the value that used to be hard-coded.

    Returns:
        A two-item list ``[training_sentences, testing_sentences]``; each
        sentence is a list of 2-tuples.
    """
    # The file is only read, so plain 'r' is enough (no write permission
    # needed), and ``with`` closes the handle even if parsing fails.
    with open(filename, 'r') as corpus_file:
        tokens = []
        for line in corpus_file:
            fields = line.split()
            if len(fields) == 3:
                tokens.append((fields[0], fields[2]))

    sentences = []
    current = []
    # NOTE(review): the very first parsed token is skipped, exactly as the
    # earlier index-based loop did. This may be an off-by-one or a deliberate
    # header skip -- confirm against the data file before changing it.
    for token in tokens[1:]:
        if token[0] == '.':
            # Sentence delimiter: close the sentence (possibly empty).
            sentences.append(current)
            current = []
        else:
            current.append(token)
    # Keep trailing tokens that are not terminated by a final '.'.
    if current:
        sentences.append(current)

    return [sentences[:split_index], sentences[split_index:]]


# building our corpus
# NOTE(review): under Python 2, input() evaluates the typed text as a Python
# expression (which is why the prompt asks for quotes); raw_input() would read
# it verbatim. Confirm the target interpreter before changing this call.
filename = input("enter the file name please , (put it in single cotations and add .txt at the end):" )
train_test = BuildCorpus(filename)

# first item of train_test is the training part, second item the testing part
training, testing = train_test[0], train_test[1]

# test is the flat list of bare tokens (first tuple item, tag removed) taken
# from every sentence of testing; it is the input for tagging later on
test = [token[0] for sentence in testing for token in sentence]

#word patterns
# Suffix rules for the regexp tagger: each entry is (regex, tag) where the
# regex matches any token ending in the given suffix. Location suffixes come
# first, followed by person-name suffixes.
word_patterns = [
    (r'.*' + suffix + r'$', label)
    for label, suffixes in (
        ('B-LOC', ('ستان', 'آباد')),
        ('B-PERS', ('ي', 'پور', 'نژاد', 'زاده')),
    )
    for suffix in suffixes
]


#our backoff tagger
# Each tagger is given the previous one as its backoff, so the chain is
# trigram -> bigram -> unigram -> regexp suffix rules -> constant 'O'.
default_tagger = nltk.DefaultTagger('O')
final_tagger = nltk.tag.RegexpTagger(
    word_patterns,
    backoff=default_tagger,
)
unigram_tagger = nltk.UnigramTagger(train=training, backoff=final_tagger)
bigram_tagger = nltk.BigramTagger(train=training, backoff=unigram_tagger)
trigram_tagger = nltk.TrigramTagger(train=training, backoff=bigram_tagger)


# evaluate is the score of our tagger: the tags of testing are removed, the tokens are re-tagged, and the result is compared with the original tags to see how we did
# tagger is our tagging function

def tagger(data):
    """Tag *data* with the module-level ``trigram_tagger`` and return its output."""
    tagged = trigram_tagger.tag(data)
    return tagged

# Score returned by the trigram tagger's evaluate() on the tagged testing
# sentences.
evaluate = trigram_tagger.evaluate(testing)

# tagged.txt is the file in which we write one test token and its tag per line

# tagging test
learned = tagger(test)

# One "token/tag" pair per line. The with-block closes tagged.txt even if a
# write raises part-way through, so the handle is never leaked.
with open('tagged.txt', 'w') as taged:
    for token, label in learned:
        taged.write(str(token) + '/' + str(label) + '\n')

# printing the evaluation score
# Written as a single parenthesised string so the output is identical whether
# print is a statement (Python 2) or a function (Python 3).
print(" the evaluation is:" + " " + str(evaluate))
	import nltk
	from nltk.tag import brill
	from nltk.tag import crf
	from nltk.tag.sequential import *
	from nltk.corpus import udhr
	from nltk.corpus import PlaintextCorpusReader
	from nltk.corpus import brown
	from nltk.corpus.reader import CategorizedPlaintextCorpusReader
	from nltk.corpus.reader import TaggedCorpusReader
	from nltk.tag.brill import SymmetricProximateTokensTemplate, ProximateTokensTemplate
	from nltk.tag.brill import ProximateTagsRule, ProximateWordsRule, FastBrillTaggerTrainer

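	# NOTE(review): everything from the second "import nltk" onward repeats the script above with an extra leading tab -- looks like a paste artifact; confirm and remove.
	# train_test is a list: the first item is the training corpus, the second item is the testing corpus
	# this function returns train_test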

	def BuildCorpus(filename, split_index=5911):
	    """Read a one-token-per-line tagged file and split it into train/test.

	    Only lines with exactly three whitespace-separated fields are used. The
	    middle field is discarded, so every token becomes a
	    ``(first_field, last_field)`` tuple. Tokens are grouped into sentences:
	    a token whose first field is ``'.'`` ends the current sentence and is
	    not stored itself.

	    Args:
	        filename: Path of the corpus file to read.
	        split_index: Number of leading sentences that go to the training
	            part; the remaining sentences form the testing part. Defaults
	            to 5911, the value that used to be hard-coded.

	    Returns:
	        A two-item list ``[training_sentences, testing_sentences]``; each
	        sentence is a list of 2-tuples.
	    """
	    # The file is only read, so plain 'r' is enough (no write permission
	    # needed), and ``with`` closes the handle even if parsing fails.
	    with open(filename, 'r') as corpus_file:
	        tokens = []
	        for line in corpus_file:
	            fields = line.split()
	            if len(fields) == 3:
	                tokens.append((fields[0], fields[2]))

	    sentences = []
	    current = []
	    # NOTE(review): the very first parsed token is skipped, exactly as the
	    # earlier index-based loop did. This may be an off-by-one or a
	    # deliberate header skip -- confirm against the data file.
	    for token in tokens[1:]:
	        if token[0] == '.':
	            # Sentence delimiter: close the sentence (possibly empty).
	            sentences.append(current)
	            current = []
	        else:
	            current.append(token)
	    # Keep trailing tokens that are not terminated by a final '.'.
	    if current:
	        sentences.append(current)

	    return [sentences[:split_index], sentences[split_index:]]



	# building our corpus
	# NOTE(review): under Python 2, input() evaluates the typed text as a Python
	# expression (which is why the prompt asks for quotes); raw_input() would
	# read it verbatim. Confirm the target interpreter before changing this call.
	filename=input("enter the file name please , (put it in single cotations and add .txt at the end):" )
	train_test=BuildCorpus(filename)

	# test is the list for testing our taggers, a flat list of tokens

	test=[]
	# first item of train_test is the training part, second item the testing part
	training=train_test[0]

	testing=train_test[1]


	# removing tags from testing to build test: keep only the first item of
	# every token tuple (loop bodies re-indented so the nesting is valid)

	for i in testing:
	    for j in i :
	        test.append(j[0])

	#word patterns
	# Suffix rules for the regexp tagger: each entry is (regex, tag) where the
	# regex matches any token ending in the given suffix. Location suffixes come
	# first, followed by person-name suffixes.
	word_patterns = [
	    (r'.*' + suffix + r'$', label)
	    for label, suffixes in (
	        ('B-LOC', ('ستان', 'آباد')),
	        ('B-PERS', ('ي', 'پور', 'نژاد', 'زاده')),
	    )
	    for suffix in suffixes
	]


	#our backoff tagger
	# Each tagger is given the previous one as its backoff, so the chain is
	# trigram -> bigram -> unigram -> regexp suffix rules -> constant 'O'.
	default_tagger = nltk.DefaultTagger('O')
	final_tagger = nltk.tag.RegexpTagger(
	    word_patterns,
	    backoff=default_tagger,
	)
	unigram_tagger = nltk.UnigramTagger(train=training, backoff=final_tagger)
	bigram_tagger = nltk.BigramTagger(train=training, backoff=unigram_tagger)
	trigram_tagger = nltk.TrigramTagger(train=training, backoff=bigram_tagger)




	# evaluate is the score of our tagger: the tags of testing are removed, the tokens are re-tagged, and the result is compared with the original tags to see how we did
	# tagger is our tagging function

	def tagger(data):
	    """Tag *data* with the module-level ``trigram_tagger`` and return its output."""
	    # Body re-indented under the def so the function is syntactically valid.
	    return trigram_tagger.tag(data)

	# Score returned by the trigram tagger's evaluate() on the tagged testing
	# sentences.
	evaluate = trigram_tagger.evaluate(testing)

	# tagged.txt is the file in which we write one test token and its tag per line

	# tagging test
	learned = tagger(test)

	# One "token/tag" pair per line. The with-block closes tagged.txt even if a
	# write raises part-way through, so the handle is never leaked.
	with open('tagged.txt', 'w') as taged:
	    for token, label in learned:
	        taged.write(str(token) + '/' + str(label) + '\n')

	# printing the evaluation score
	# Written as a single parenthesised string so the output is identical whether
	# print is a statement (Python 2) or a function (Python 3).
	print(" the evaluation is:" + " " + str(evaluate))