Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#!/usr/bin/python
import collections
import os
import pickle
try:
    # The `collections.Iterable` alias was removed in Python 3.10.
    from collections.abc import Iterable
except ImportError:  # very old interpreters without collections.abc
    from collections import Iterable
# NOTE(review): pyexpat.features is a list of expat build flags, not a
# feature-extraction function -- this looks like an accidental auto-import.
from pyexpat import features

from nltk import pos_tag, word_tokenize
from nltk.chunk import ChunkParserI, conlltags2tree, tree2conlltags
from nltk.tag import ClassifierBasedTagger
# Presumably meant to tally NER tag frequencies; nothing in the visible code
# reads or updates it -- TODO confirm whether it is still needed.
ner_tags = collections.Counter()
# Root directory of the GMB corpus; passed to read_gmb() by the script at the
# bottom of this file.
corpus_root = "gmb-2.2.0" # Make sure you set the proper path to the unzipped corpus
def to_conll_iob(annotated_sentence):
    """
    Convert a pseudo-IOB tagged sentence to proper IOB notation.

    `annotated_sentence` = iterable of triplets [(w1, t1, iob1), ...]
    Transform a pseudo-IOB notation: O, PERSON, PERSON, O, O, LOCATION, O
    to proper IOB notation: O, B-PERSON, I-PERSON, O, O, B-LOCATION, O

    A token opens a new chunk ("B-") when it is the first token of the
    sentence or when its entity label differs from the label of the token
    right before it; otherwise it continues the chunk ("I-"). 'O' tokens
    are passed through unchanged.

    Returns a new list of (word, tag, iob) triplets; the input is not
    modified.
    """
    proper_iob_tokens = []
    # Raw (un-prefixed) label of the previous token. Starting at 'O' makes
    # the first entity token of a sentence always begin a chunk.
    previous_ner = 'O'
    for word, tag, ner in annotated_sentence:
        if ner == 'O':
            iob = ner
        elif ner == previous_ner:
            iob = "I-" + ner
        else:
            iob = "B-" + ner
        proper_iob_tokens.append((word, tag, iob))
        previous_ner = ner
    return proper_iob_tokens
def read_gmb(corpus_root):
    """
    Lazily read every `*.tags` file below `corpus_root` as training samples.

    Each file is decoded as UTF-8 and split into sentences on blank lines.
    Every non-empty line is split on tabs; fields 0, 1 and 3 are taken as
    the word, its POS tag and its NER tag. NER tags other than 'O' are cut
    at the first '-' so only the leading part is kept, and the POS tags
    'LQU'/'RQU' are rewritten to "``".

    Directories and files are visited in sorted order so that the sequence
    of yielded sentences (and therefore any positional train/test split) is
    reproducible across runs and file systems. Sentence blocks without any
    token are skipped.

    Yields one list per sentence in the form [((word, tag), iob), ...].

    Raises IndexError if a token line has fewer than four tab-separated
    fields, and UnicodeDecodeError if a file is not valid UTF-8.
    """
    for root, dirs, files in os.walk(corpus_root):
        # os.walk honours in-place changes to `dirs`, so sorting it here
        # fixes the order in which sub-directories are descended into.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith(".tags"):
                continue

            # Read everything and close the file before the first yield, so a
            # suspended (or abandoned) generator never keeps a handle open.
            with open(os.path.join(root, filename), 'rb') as file_handle:
                file_content = file_handle.read().decode('utf-8').strip()

            for annotated_sentence in file_content.split('\n\n'):
                standard_form_tokens = []
                for annotated_token in annotated_sentence.split('\n'):
                    if not annotated_token:
                        continue
                    annotations = annotated_token.split('\t')
                    word, tag, ner = annotations[0], annotations[1], annotations[3]

                    if ner != 'O':
                        ner = ner.split('-')[0]

                    if tag in ('LQU', 'RQU'):  # Make it NLTK compatible
                        tag = "``"

                    standard_form_tokens.append((word, tag, ner))

                # An empty file or a run of blank lines produces a block with
                # no tokens; an empty sentence is not a usable sample.
                if not standard_form_tokens:
                    continue

                conll_tokens = to_conll_iob(standard_form_tokens)

                # Make it NLTK Classifier compatible - [(w1, t1, iob1), ...] to [((w1, t1), iob1), ...]
                # Because the classifier expects a tuple as input, first item input, second the class
                yield [((w, t), iob) for w, t, iob in conll_tokens]
def _ner_features(tokens, index, history):
    """
    Default feature detector handed to ClassifierBasedTagger.

    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the IOB tags already assigned to tokens[0:index]

    Returns a dict of hashable features describing the token, its two
    neighbours on each side and the two previously assigned IOB tags.
    """
    def token_at(offset):
        # Positions outside the sentence map to placeholder tokens so the
        # context features are always defined.
        position = index + offset
        if position < 0:
            return ('[START]', '[START]')
        if position >= len(tokens):
            return ('[END]', '[END]')
        return tokens[position]

    word, pos = token_at(0)
    prev_word, prev_pos = token_at(-1)
    prev2_word, prev2_pos = token_at(-2)
    next_word, next_pos = token_at(1)
    next2_word, next2_pos = token_at(2)

    prev_iob = history[index - 1] if index > 0 else '[START]'
    prev2_iob = history[index - 2] if index > 1 else '[START]'

    return {
        'word': word,
        'word-lower': word.lower(),
        'suffix3': word[-3:],
        'pos': pos,
        'is-capitalized': word[:1].isupper(),
        'is-all-caps': word.isupper(),
        'is-numeric': word.isdigit(),
        'has-dash': '-' in word,
        'prev-word': prev_word.lower(),
        'prev-pos': prev_pos,
        'prev2-word': prev2_word.lower(),
        'prev2-pos': prev2_pos,
        'next-word': next_word.lower(),
        'next-pos': next_pos,
        'next2-word': next2_word.lower(),
        'next2-pos': next2_pos,
        'prev-iob': prev_iob,
        'prev2-iob': prev2_iob,
    }


class NamedEntityChunker(ChunkParserI):
    """
    Chunk parser that labels named entities with a ClassifierBasedTagger.

    The tagger is trained on sentences of the form [((word, pos), iob), ...]
    and predicts one IOB tag per (word, pos) pair.
    """

    def __init__(self, train_sents, **kwargs):
        """
        Train the underlying tagger.

        `train_sents` = iterable of sentences [((w1, t1), iob1), ...]
        `kwargs`      = forwarded to ClassifierBasedTagger; an optional
                        `feature_detector` callable (tokens, index, history)
                        replaces the built-in detector.
        """
        assert isinstance(train_sents, Iterable), \
            "train_sents must be an iterable of tagged sentences"
        # The module-level name `features` is pyexpat's list of build flags,
        # not a callable, so it cannot serve as a feature detector.
        self.feature_detector = kwargs.pop('feature_detector', _ner_features)
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=self.feature_detector,
            **kwargs)

    def parse(self, tagged_sent):
        """
        Chunk a POS-tagged sentence.

        `tagged_sent` = list of pairs [(w1, t1), ...]
        Returns the result of conlltags2tree() applied to the predicted
        (word, tag, iob) triplets.
        """
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
# Materialise the whole corpus: the positional 90/10 split below needs the
# total number of sentences.
reader = read_gmb(corpus_root)
data = list(reader)

split_index = int(len(data) * 0.9)
training_samples, test_samples = data[:split_index], data[split_index:]

# Train on the first 2000 training sentences only, then chunk a demo sentence.
chunker = NamedEntityChunker(training_samples[:2000])
demo_tokens = word_tokenize("I'm going to Germany this Monday.")
print(chunker.parse(pos_tag(demo_tokens)))

# Sizes reported by the original author: 55809 training / 6201 test samples.
print("#training samples = %s" % len(training_samples))
print("#test samples = %s" % len(test_samples))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.