Skip to content

Instantly share code, notes, and snippets.

@h-alg
Last active August 29, 2015 14:02
Show Gist options
  • Save h-alg/4ec991f90a682c6d0a0b to your computer and use it in GitHub Desktop.
Save h-alg/4ec991f90a682c6d0a0b to your computer and use it in GitHub Desktop.
import nltk
from nltk.tag import brill
from nltk.tag import crf
from nltk.tag.sequential import *
from nltk.corpus import udhr
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import brown
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag.brill import SymmetricProximateTokensTemplate, ProximateTokensTemplate
from nltk.tag.brill import ProximateTagsRule, ProximateWordsRule, FastBrillTaggerTrainer
# train_test is a list: the first item is the training corpus, the second is the testing corpus
# this function returns train_test
def BuildCorpus(filename, split_index=5911):
    """Read a one-token-per-line tagged file and split it into train/test sets.

    Every non-blank line is split on whitespace.  A line with exactly three
    fields has its middle field removed, leaving the first and last fields;
    lines with any other number of fields are kept unchanged.  A token whose
    first field is '.' closes the current sentence and is not stored itself.

    Args:
        filename: path of the text file to read.
        split_index: number of leading sentences placed in the training
            set; all remaining sentences form the testing set.  Defaults to
            5911, the value that used to be hard-coded.

    Returns:
        A two-item list ``[training, testing]``.  Each item is a list of
        sentences, and each sentence is a list of token tuples.

    Raises:
        IOError: if the file cannot be opened for reading.
    """
    tokens = []
    # Read-only access is sufficient, and the context manager closes the
    # handle even when parsing fails.
    with open(filename, 'r') as corpus_file:
        for line in corpus_file:
            fields = line.split()
            if not fields:
                # A blank line carries no token; storing an empty tuple would
                # make the token[0] lookup below raise IndexError.
                continue
            if len(fields) == 3:
                fields.pop(1)
            tokens.append(tuple(fields))

    # Group tokens into sentences, starting from the very first token so
    # that nothing at the head of the file is lost.
    sentences = []
    current = []
    for token in tokens:
        if token[0] == '.':
            sentences.append(current)
            current = []
        else:
            current.append(token)
    if current:
        # Tokens after the last '.' still form a (final) sentence.
        sentences.append(current)

    return [sentences[:split_index], sentences[split_index:]]
# Build the corpus from a file named by the user.
# SECURITY: Python 2's input() eval()s whatever is typed, which is why the
# old prompt asked for the name in quotes.  The name is now read verbatim;
# surrounding quotes are still stripped so the old way of answering works.
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3: input() already returns the raw text
filename = _read_line("enter the file name please (add .txt at the end): ")
filename = filename.strip().strip("'\"")
train_test = BuildCorpus(filename)

# BuildCorpus returns [training sentences, testing sentences].
training = train_test[0]
testing = train_test[1]

# test is the list used for tagging: the first field of every token in
# testing, flattened across sentences (i.e. testing with the tags removed).
test = [token[0] for sentence in testing for token in sentence]
# Backoff chain, built from the simplest tagger upwards.  Each n-gram tagger
# is trained on the training sentences and is given the previous tagger as
# its backoff; the chain ends in a default tagger that assigns the tag 'O'.
default_tagger = nltk.DefaultTagger('O')
unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
# Rule templates for the Brill trainer.
# The symmetric templates cover the same four spans twice: first with the
# tag-based rule class, then with the word-based one (order is preserved).
_symmetric_spans = [(1, 1), (2, 2), (1, 2), (1, 3)]
templates = [
    SymmetricProximateTokensTemplate(_rule_class, _span)
    for _rule_class in (ProximateTagsRule, ProximateWordsRule)
    for _span in _symmetric_spans
]
# Two further templates, each built from the position ranges (-1, -1) and (1, 1).
templates.extend([
    ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)),
    ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1, 1)),
])

# Train the Brill tagger on top of the trigram backoff chain, keeping at
# most 10 learned rules.
trainer = FastBrillTaggerTrainer(
    initial_tagger=trigram_tagger,
    templates=templates,
    trace=3,
    deterministic=True
)
brill_tagger = trainer.train(training, max_rules=10)
# tagger is our tagging function: it passes its argument to the trained
# Brill tagger and returns whatever that tagger produces.
def tagger(data):
    """Return the result of brill_tagger.tag() applied to data."""
    tagged = brill_tagger.tag(data)
    return tagged


# evaluate strips the tags from testing, retags it with our tagger, and
# compares the result with the original tags to see how we did.
evaluate = brill_tagger.evaluate(testing)
# Tag the test tokens and write them to tagged.txt, one "token/tag" per line.
learned = tagger(test)
# The context manager closes the file even if a write fails part-way.
with open('tagged.txt', 'w') as taged:
    for tag in learned:
        taged.write(str(tag[0]) + '/' + str(tag[1]) + '\n')

# Print the evaluation score.  A single pre-joined string gives the same
# output as the old print statement and is also valid call syntax.
print(" the evaluation is:" + " " + str(evaluate))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment