# History-based part-of-speech tagger: logistic regression over hashed features.
# Gist by @alep, last active August 29, 2015.
from collections import namedtuple

import nltk
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LogisticRegression
# One sample per word: the word itself, its gold/predicted tag, a two-word
# context window on each side, and the tags of the two preceding words.
Data = namedtuple('Data', [
    'word_i_minus_2',
    'tag_i_minus_2',
    'word_i_minus_1',
    'tag_i_minus_1',
    'word',
    'tag',
    'word_i_plus_1',
    'word_i_plus_2',
])
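# Illustrative example (not in the original gist): for the tagged sentence
# [("the", "DT"), ("dog", "NN"), ("barks", "VBZ")], the sample centred on
# "dog" produced by preprocess_train below is
#   Data(word_i_minus_2='__none__', tag_i_minus_2='__none__',
#        word_i_minus_1='the', tag_i_minus_1='DT',
#        word='dog', tag='NN',
#        word_i_plus_1='barks', word_i_plus_2='__none__')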
class LRTagger(object):
    """POS tagger: logistic regression over hashed word/tag context
    features, decoded greedily left to right."""
    def preprocess_train(self, tagged_sentence):
        """Yield one Data sample per (word, tag) pair, with a +/-2 window."""
        pad = ("__none__", "__none__")
        # Current NLTK spells the padding keywords left_pad_symbol /
        # right_pad_symbol; the original used the older pad_symbol argument.
        for ngram in nltk.util.ngrams(
                tagged_sentence, 5, pad_left=True, pad_right=True,
                left_pad_symbol=pad, right_pad_symbol=pad):
            wt_m2, wt_m1, wt, wt_p1, wt_p2 = ngram
            if wt != pad:
                yield Data(
                    tag_i_minus_2=str(wt_m2[1]),
                    word_i_minus_2=str(wt_m2[0]),
                    tag_i_minus_1=str(wt_m1[1]),
                    word_i_minus_1=str(wt_m1[0]),
                    word=str(wt[0]),
                    tag=str(wt[1]),
                    word_i_plus_1=str(wt_p1[0]),
                    word_i_plus_2=str(wt_p2[0]),
                )
    def preprocess_test(self, sentence):
        """Yield one Data sample per word of an untagged sentence; the tag
        fields are placeholders, filled in during decoding by finder()."""
        for ngram in nltk.util.ngrams(
                sentence, 5, pad_left=True, pad_right=True,
                left_pad_symbol="__none__", right_pad_symbol="__none__"):
            w_m2, w_m1, w, w_p1, w_p2 = ngram
            if w != "__none__":
                yield Data(
                    tag_i_minus_2="__none__",
                    word_i_minus_2=w_m2,
                    tag_i_minus_1="__none__",
                    word_i_minus_1=w_m1,
                    word=w,
                    tag="__none__",
                    word_i_plus_1=w_p1,
                    word_i_plus_2=w_p2,
                )
    def feat(self, data):
        """Yield the string features of a sample, later hashed by FeatureHasher."""
        features = [
            lambda data: data.tag_i_minus_1,
            lambda data: "%s %s" % (data.tag_i_minus_2, data.tag_i_minus_1),
            lambda data: data.word_i_minus_1,
            lambda data: data.word_i_minus_2,
            lambda data: data.word_i_plus_1,
            lambda data: data.word_i_plus_2,
            lambda data: data.word,
            lambda data: ('__hasnum__' if any(digit in data.word
                                              for digit in '1234567890')
                          else '__nonum__'),
            lambda data: '__hyphen__' if '-' in data.word else '__nohyphen__',
            # The original tested x.isupper without calling it, which is
            # always truthy; the parentheses fix that bug.
            lambda data: ('__upper__' if any(ch.isupper() for ch in data.word)
                          else '__noupper__'),
            # Prefixes and suffixes of length 2 to 4.
            lambda data: data.word[:2],
            lambda data: data.word[-2:],
            lambda data: data.word[:3],
            lambda data: data.word[-3:],
            lambda data: data.word[:4],
            lambda data: data.word[-4:],
        ]
        for f in features:
            yield f(data)
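    # Illustrative example (not in the original gist): for the "dog" sample
    # shown above, feat() yields, in order:
    #   'DT', '__none__ DT', 'the', '__none__', 'barks', '__none__', 'dog',
    #   '__nonum__', '__nohyphen__', '__noupper__',
    #   'do', 'og', 'dog', 'dog', 'dog', 'dog'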
    def transform(self, train):
        """Flatten a corpus of tagged sentences into a stream of Data samples."""
        for sent in train:
            for sample in self.preprocess_train(sent):
                yield sample
    def finder(self, test):
        # Greedy left-to-right decoding: the two most recent predicted tags
        # feed back into the features of the next word. Must explore using
        # search (e.g. beam search) instead of committing to one history.
        tag_i_minus_1 = "__none__"
        tag_i_minus_2 = "__none__"
        for sample in test:
            sample = sample._replace(
                tag_i_minus_1=tag_i_minus_1,
                tag_i_minus_2=tag_i_minus_2)
            hashed_sample = self.hasher.transform(
                [self.feat(sample)])
            predicted_tag = self.model.predict(hashed_sample)
            tag_i_minus_2 = tag_i_minus_1
            tag_i_minus_1 = str(predicted_tag[0])
            yield sample.word, str(predicted_tag[0])
    def __init__(self, train):
        self.hasher = FeatureHasher(input_type='string')
        self.train = list(self.transform(train))
        X = self.hasher.transform(self.feat(d) for d in self.train)
        # A list comprehension, not map(): under Python 3, map() returns an
        # iterator, which scikit-learn's fit() cannot take as a label vector.
        y = [d.tag for d in self.train]
        self.model = LogisticRegression()
        self.model.fit(X, y)
    def tag(self, words):
        """Tag a list of word strings; returns a list of (word, tag) pairs."""
        return list(self.finder(self.preprocess_test(words)))
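

# Minimal usage sketch, not part of the original gist. It assumes the NLTK
# treebank corpus has been downloaded (nltk.download('treebank')); the slice
# size is arbitrary, chosen only to keep the demo quick.
if __name__ == '__main__':
    from nltk.corpus import treebank

    tagger = LRTagger(treebank.tagged_sents()[:500])
    print(tagger.tag(['The', 'cat', 'sat', 'on', 'the', 'mat', '.']))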