Skip to content

Instantly share code, notes, and snippets.

@h-alg

h-alg/first.py Secret

Last active August 29, 2015 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save h-alg/9f4de0e1b5c0a5a9f9e6 to your computer and use it in GitHub Desktop.
Save h-alg/9f4de0e1b5c0a5a9f9e6 to your computer and use it in GitHub Desktop.
import nltk
from nltk.tag.sequential import *
from nltk.corpus import udhr
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import brown
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus.reader import TaggedCorpusReader
# train_test is a list: the first item is the training corpus, the second one is the testing corpus
# this function returns train_test
def BuildCorpus(filename, train_size=5911):
    """Read a tagged-token file and split it into training and testing sentences.

    Every line of *filename* that holds exactly three whitespace-separated
    fields contributes one token: the middle field is discarded and the
    other two are kept as a ``(first_field, last_field)`` tuple.  Lines with
    any other number of fields (blank lines included) are ignored.
    NOTE(review): the indentation of the original code was lost, so whether
    non-3-field lines were meant to be skipped or kept should be confirmed.

    Tokens are then grouped into sentences: a token whose first field is
    ``'.'`` ends the current sentence and is not stored itself, so two
    consecutive ``'.'`` tokens produce an empty sentence.

    Args:
        filename: path of the corpus file to read.
        train_size: number of leading sentences used for training
            (defaults to 5911, the value that used to be hard-coded).

    Returns:
        A two-item list ``[training, testing]``; each item is a list of
        sentences and each sentence is a list of 2-tuples.
    """
    # Read-only access is enough, and the context manager closes the handle.
    # (train.txt / test.txt used to be opened for writing here but were never
    # written to or closed, which only leaked handles and truncated the files.)
    tokens = []
    with open(filename, 'r') as corpus_file:
        for line in corpus_file:
            fields = line.split()
            if len(fields) == 3:
                tokens.append((fields[0], fields[2]))

    # Walk the tokens from the very first one: the previous index-based loop
    # advanced its cursor before reading, so the first token was dropped.
    sentences = []
    current = []
    for token in tokens:
        if token[0] == '.':
            sentences.append(current)
            current = []
        else:
            current.append(token)
    if current:
        # The file did not end with '.', so keep the unfinished sentence too.
        sentences.append(current)

    return [sentences[:train_size], sentences[train_size:]]
# building our corpus from a file named by the user
# NOTE(security): this used to call input(), which on Python 2 evaluates the
# typed text as Python code.  Read plain text instead and strip optional
# surrounding quotes so answers typed the old way ("corpus.txt") still work.
try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3: input() already returns plain text
# The prompt used to contain "" inside a double-quoted literal, which Python
# parses as two adjacent strings, so the quote hint was silently lost.
filename = _read_line('enter the file name please:[put it in ""]')
filename = filename.strip().strip('"\'')
train_test = BuildCorpus(filename)
training = train_test[0]
testing = train_test[1]
# test is the list for testing our taggers: the first item of every token in
# testing (i.e. the tags removed), flattened across sentences
test = [token[0] for sentence in testing for token in sentence]
# our backoff chain: trigram -> bigram -> unigram -> default tag 'O'
default_tagger = nltk.DefaultTagger('O')
unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
trigram_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
# evaluate: the tagger strips the tags from testing, re-tags the tokens, and compares the result with the original tags to see how we did
# tagger is our tagging function
def tagger(data):
    """Tag *data* with the module-level trigram tagger and return its output."""
    tagged = trigram_tagger.tag(data)
    return tagged
# score of the trigram tagger (with its backoff chain) on the testing sentences
evaluate = trigram_tagger.evaluate(testing)
# tagged.txt receives one "token/tag" pair per line for the test tokens
# tagging test
learned = tagger(test)
# the with-block closes the file even if a write fails part-way through
with open('tagged.txt', 'w') as taged:
    for tag in learned:
        taged.write('%s/%s\n' % (str(tag[0]), str(tag[1])))
# printing the evaluation score (single-argument form prints the same text on
# Python 2 as the old print statement and also parses on Python 3)
print(" the evaluation is: %s" % (evaluate,))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment