-
-
Save h-alg/f1f6175cf382d7393dac to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
from nltk.tag import brill | |
from nltk.tag import crf | |
from nltk.tag.sequential import * | |
from nltk.corpus import udhr | |
from nltk.corpus import PlaintextCorpusReader | |
from nltk.corpus import brown | |
from nltk.corpus.reader import CategorizedPlaintextCorpusReader | |
from nltk.corpus.reader import TaggedCorpusReader | |
from nltk.tag.brill import SymmetricProximateTokensTemplate, ProximateTokensTemplate | |
from nltk.tag.brill import ProximateTagsRule, ProximateWordsRule, FastBrillTaggerTrainer | |
def BuildCorpus(filename, split_index=5911):
    """Read a tagged token file and split it into training and testing sentences.

    Every line that has exactly three whitespace-separated fields is kept as
    a ``(first_field, third_field)`` tuple (the middle field is discarded);
    all other lines are ignored.  The kept tuples are grouped into sentences:
    a tuple whose first field is ``'.'`` ends the current sentence and is not
    itself stored.  Empty sentences (e.g. two delimiters in a row) are
    skipped.

    Args:
        filename: path of the corpus file to read.
        split_index: number of leading sentences that go into the training
            portion; the remainder becomes the testing portion.  Defaults to
            5911, the value that used to be hard-coded.

    Returns:
        A two-item list ``[training, testing]``.  Each item is a list of
        sentences, and each sentence is a list of 2-tuples as described
        above.
    """
    tokens = []
    # Read-only mode is sufficient, and the context manager guarantees the
    # file is closed even if reading fails.
    with open(filename, 'r') as corpus_file:
        for line in corpus_file:
            fields = line.split()
            if len(fields) == 3:
                tokens.append((fields[0], fields[2]))

    sentences = []
    current = []
    # Start from the very first token: the previous index-based loop advanced
    # before reading and therefore always lost tokens[0].
    for token in tokens:
        if token[0] == '.':
            if current:
                sentences.append(current)
            current = []
        else:
            current.append(token)
    # Keep a trailing sentence that is not terminated by a delimiter.
    if current:
        sentences.append(current)

    return [sentences[:split_index], sentences[split_index:]]
# Build the corpus from a file named by the user.
# NOTE(review): the print statement at the end of this script implies
# Python 2, where input() evaluates the typed text as an expression -- which
# is presumably why the prompt asks for quotes around the name.  Confirm the
# target interpreter before changing this call.
filename = input("enter the file name please , (put it in single cotations and add .txt at the end):")
train_test = BuildCorpus(filename)

# First item is the training portion, second item is the testing portion.
training = train_test[0]
testing = train_test[1]

# test: the testing tokens with their tags stripped, flattened into a single
# list across all sentences.
test = [pair[0] for sentence in testing for pair in sentence]
# Word-ending patterns used as a fallback: each entry pairs a regular
# expression with the tag assigned to tokens that match it.
word_patterns = [
    (r'.*ستان$', 'B-LOC'),
    (r'.*آباد$', 'B-LOC'),
    (r'.*ي$', 'B-PERS'),
    (r'.*پور$', 'B-PERS'),
    (r'.*نژاد$', 'B-PERS'),
    (r'.*زاده$', 'B-PERS'),
]

# Backoff chain, from most to least specific:
#   trigram -> bigram -> unigram -> regexp patterns -> constant 'O'.
# Each tagger is handed the next one as its backoff.
default_tagger = nltk.DefaultTagger('O')
final_tagger = nltk.tag.RegexpTagger(word_patterns, backoff=default_tagger)
unigram_tagger = nltk.UnigramTagger(train=training, backoff=final_tagger)
bigram_tagger = nltk.BigramTagger(train=training, backoff=unigram_tagger)
trigram_tagger = nltk.TrigramTagger(train=training, backoff=bigram_tagger)
# tagger is the tagging entry point of this script.
def tagger(data):
    """Tag the token list *data* with the module-level trigram tagger."""
    tagged_tokens = trigram_tagger.tag(data)
    return tagged_tokens


# evaluate: score of the trigram tagger on the testing sentences (the tagger
# re-tags the untagged tokens and the result is compared with the original
# tags).
evaluate = trigram_tagger.evaluate(testing)
# Tag the flat list of test tokens and write the result to tagged.txt, one
# "token/tag" pair per line.
# NOTE(review): test is a single flat list, so the whole testing portion is
# tagged as one sequence rather than sentence by sentence -- confirm this is
# intended.
learned = tagger(test)
# The context manager closes the file even if a write raises.
with open('tagged.txt', 'w') as taged:
    for tag in learned:
        taged.write(str(tag[0]) + '/' + str(tag[1]) + '\n')
# Print the evaluation score.  Written as a single formatted string so the
# line parses under both Python 2 and Python 3 and prints the same text as
# the old print statement.
print(" the evaluation is: %s" % (evaluate,))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment