-
-
Save h-alg/9f4de0e1b5c0a5a9f9e6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
from nltk.tag.sequential import * | |
from nltk.corpus import udhr | |
from nltk.corpus import PlaintextCorpusReader | |
from nltk.corpus import brown | |
from nltk.corpus.reader import CategorizedPlaintextCorpusReader | |
from nltk.corpus.reader import TaggedCorpusReader | |
# train_test is a list: the first item is the training corpus, the second is the testing corpus.
# This function returns train_test.
def BuildCorpus(filename, train_size=5911):
    """Read a column-formatted corpus and split it into train/test sentences.

    Each line of *filename* is split on whitespace. Only lines with exactly
    three fields are used; the middle field is discarded, leaving a
    ``(first_field, last_field)`` tuple per token. Tokens are then grouped
    into sentences: a token whose first field is ``'.'`` ends the current
    sentence and is not itself kept.

    Args:
        filename: Path of the corpus file to read.
        train_size: Number of leading sentences that go into the training
            part; the remaining sentences form the testing part.

    Returns:
        A two-item list ``[training, testing]``. Each item is a list of
        sentences, and each sentence is a list of 2-tuples.
    """
    # Differences from the earlier implementation:
    #  * "train.txt" and "test.txt" are no longer opened in 'w' mode. Nothing
    #    was ever written to them, the handles were leaked, and opening them
    #    truncated the input when it was itself named train.txt or test.txt.
    #  * The input is opened read-only ('r' instead of 'r+') and is closed
    #    by the with-block.
    #  * The first token of the corpus is no longer skipped.
    tokens = []
    with open(filename, 'r') as corpus_file:
        for line in corpus_file:
            fields = line.split()
            if len(fields) == 3:
                tokens.append((fields[0], fields[2]))

    sentences = []
    current = []
    for token in tokens:
        if token[0] == '.':
            # Sentence delimiter: close the sentence (possibly empty, when
            # two delimiters are adjacent) and start a new one.
            sentences.append(current)
            current = []
        else:
            current.append(token)
    # Keep a trailing sentence that has no closing '.'.
    if current:
        sentences.append(current)

    return [sentences[:train_size], sentences[train_size:]]
# Build the corpus from a file named by the user.
# NOTE(review): under Python 2, input() evaluates the typed text as an
# expression (hence the request for quotes); under Python 3 it returns the
# raw string. The two adjacent string literals below are concatenated, so
# the prompt is displayed as "...[put it in ]".
filename = input("enter the file name please:[put it in ""]")
train_test = BuildCorpus(filename)
training, testing = train_test[0], train_test[1]

# test holds the bare tokens of the testing sentences (tags removed),
# flattened into a single list.
test = [pair[0] for sentence in testing for pair in sentence]

# Backoff chain: trigram -> bigram -> unigram -> default tag 'O'.
default_tagger = nltk.DefaultTagger('O')
unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
# evaluate strips the tags from testing, re-tags it with our tagger, and compares the result with the original tags to see how we did.
# tagger is our tagging function.
def tagger(data):
    """Tag *data* with the module-level trigram tagger and return the result.

    *data* is passed straight through to ``trigram_tagger.tag``.
    """
    tagged = trigram_tagger.tag(data)
    return tagged
# Score the trigram tagger against the tagged testing sentences.
evaluate = trigram_tagger.evaluate(testing)

# Tag the untagged test tokens and write one "token/tag" pair per line to
# tagged.txt.
learned = tagger(test)
# The with-block closes the file even if a write raises part-way through.
with open('tagged.txt', 'w') as taged:
    for tag in learned:
        taged.write(str(tag[0]) + '/' + str(tag[1]) + '\n')

# Print the evaluation score. The single-argument parenthesised form prints
# the same text as the Python 2 print statement and is also valid Python 3.
print(" the evaluation is: %s" % (evaluate,))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment