Skip to content

Instantly share code, notes, and snippets.

@h-alg

h-alg/regexp.py Secret

Last active August 29, 2015 14:02
Show Gist options
  • Save h-alg/f1f6175cf382d7393dac to your computer and use it in GitHub Desktop.
Save h-alg/f1f6175cf382d7393dac to your computer and use it in GitHub Desktop.
import nltk
from nltk.tag import brill
from nltk.tag import crf
from nltk.tag.sequential import *
from nltk.corpus import udhr
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import brown
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag.brill import SymmetricProximateTokensTemplate, ProximateTokensTemplate
from nltk.tag.brill import ProximateTagsRule, ProximateWordsRule, FastBrillTaggerTrainer
# train_test is a list: the first item is the training corpus, the second is the testing corpus.
# This function returns train_test.
def BuildCorpus(filename, split_index=5911):
    """Read a token-per-line file and return ``[training, testing]`` sentences.

    Every line is split on whitespace. A line with exactly three fields has
    its middle field dropped and is stored as a ``(first, last)`` tuple;
    lines with any other number of fields are ignored.
    (Presumably the columns are word / POS / entity tag -- confirm against
    the corpus file.)

    The records are then grouped into sentences: a record whose first field
    is ``'.'`` ends the current sentence and is itself not stored.

    Args:
        filename: path of the corpus file to read.
        split_index: number of leading sentences that go into the training
            part; the remaining sentences form the testing part. Defaults to
            5911, the value that was previously hard-coded.

    Returns:
        A two-item list ``[training, testing]``; each item is a list of
        sentences and each sentence is a list of 2-tuples.
    """
    # NOTE(review): the original indentation was lost; it is assumed here
    # that only three-field lines are kept -- confirm against the corpus.
    records = []
    # Read-only mode is enough, and the with-block guarantees the handle is
    # closed (the previous code opened with 'r+' and never closed the file).
    with open(filename, 'r') as myfile:
        for line in myfile:
            fields = line.split()
            if len(fields) == 3:
                fields.pop(1)
                records.append(tuple(fields))

    sentences = []
    x = 0
    # NOTE(review): x is advanced before the first read, so records[0] is
    # never part of any sentence. After the first pass the same increment
    # steps over the '.' record that closed the previous sentence. Kept
    # as-is; confirm whether dropping the first record is intended.
    while x < len(records) - 1:
        sentence = []
        x += 1
        while records[x][0] != '.':
            sentence.append(records[x])
            x += 1
            if x == len(records):
                # Ran off the end without a closing '.'; keep what we have.
                break
        sentences.append(sentence)

    return [sentences[:split_index], sentences[split_index:]]
# Build the corpus.
# Python 2's input() evaluates whatever the user types as an expression
# (which is why the prompt asks for quotes). Read the raw text instead and
# strip any surrounding quotes, so answers typed the old way still work.
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3: input() already returns the raw text
filename = _read_line("enter the file name please , (put it in single cotations and add .txt at the end):").strip().strip("'\"")
train_test = BuildCorpus(filename)
# training and testing are lists of sentences; each sentence is a list of tuples.
training = train_test[0]
testing = train_test[1]
# test is the flat list of tokens used to exercise the taggers: the first
# element of every tuple in testing, i.e. the testing data with tags removed.
test = [pair[0] for sentence in testing for pair in sentence]
# Word patterns: suffix regular expressions paired with the tag they assign.
# They are grouped by tag and then concatenated, so the overall order is
# location patterns first, followed by person patterns.
_LOCATION_SUFFIX_PATTERNS = [
    (r'.*ستان$', 'B-LOC'),
    (r'.*آباد$', 'B-LOC'),
]
_PERSON_SUFFIX_PATTERNS = [
    (r'.*ي$', 'B-PERS'),
    (r'.*پور$', 'B-PERS'),
    (r'.*نژاد$', 'B-PERS'),
    (r'.*زاده$', 'B-PERS'),
]
word_patterns = _LOCATION_SUFFIX_PATTERNS + _PERSON_SUFFIX_PATTERNS

# Backoff chain, most specific tagger first:
# trigram -> bigram -> unigram -> regexp (word_patterns) -> default tag 'O'.
default_tagger = nltk.DefaultTagger('O')
final_tagger = nltk.tag.RegexpTagger(
    word_patterns,
    backoff=default_tagger,
)
unigram_tagger = nltk.UnigramTagger(
    training,
    backoff=final_tagger,
)
bigram_tagger = nltk.BigramTagger(
    training,
    backoff=unigram_tagger,
)
trigram_tagger = nltk.TrigramTagger(
    training,
    backoff=bigram_tagger,
)
# evaluate strips the tags from testing, re-tags the tokens with our tagger, and compares the result with the original tags to see how well we did.
# tagger is our tagging function.
def tagger(data):
    """Tag *data* with the module-level ``trigram_tagger`` and return the result."""
    tagged_tokens = trigram_tagger.tag(data)
    return tagged_tokens
# Score the tagger chain on the tagged testing sentences.
evaluate = trigram_tagger.evaluate(testing)
# Tag the untagged test tokens.
learned = tagger(test)
# tagged.txt receives one "token/tag" entry per line. The with-block closes
# the file even if a write fails part-way through.
with open('tagged.txt', 'w') as taged:
    for tag in learned:
        taged.write(str(tag[0]) + '/' + str(tag[1]) + '\n')
# Print the evaluation score. A single pre-formatted string produces the same
# output from the Python 2 print statement and the Python 3 print function.
print(" the evaluation is: %s" % (evaluate,))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment