# Named-entity chunker trained on the GMB 2.2.0 corpus with NLTK.
# (Recovered from a gist scrape; GitHub page chrome removed.)
import os
import collections
import pickle  # kept: the original tutorial pickles the trained chunker
# `Iterable` was removed from `collections` in Python 3.10; import it from
# `collections.abc` instead (available since Python 3.3).
from collections.abc import Iterable
# NOTE(review): `pyexpat.features` is a list of expat build options, NOT a
# feature-detector function.  The tutorial this gist comes from defines a
# `features(tokens, index, history)` function that should be used instead —
# this import is kept only so the name stays bound; replace it.  TODO confirm.
from pyexpat import features
from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI, conlltags2tree, tree2conlltags
from nltk import pos_tag, word_tokenize

# Running tally of NER tag frequencies (populated elsewhere in the tutorial).
ner_tags = collections.Counter()
corpus_root = "gmb-2.2.0" # Make sure you set the proper path to the unzipped corpus
def to_conll_iob(annotated_sentence):
    """Convert pseudo-IOB tags to proper IOB notation.

    `annotated_sentence` is a list of triplets [(w1, t1, iob1), ...].
    Transforms a pseudo-IOB sequence: O, PERSON, PERSON, O, O, LOCATION, O
    into proper IOB notation:         O, B-PERSON, I-PERSON, O, O, B-LOCATION, O
    """
    proper_iob_tokens = []
    for idx, annotated_token in enumerate(annotated_sentence):
        tag, word, ner = annotated_token
        if ner != 'O':
            if idx == 0:
                # First token of the sentence always begins a chunk.
                ner = "B-" + ner
            elif annotated_sentence[idx - 1][2] == ner:
                # Same entity type as the previous token: continue the chunk.
                ner = "I-" + ner
            else:
                # Different entity type: a new chunk begins here.
                ner = "B-" + ner
        proper_iob_tokens.append((tag, word, ner))
    return proper_iob_tokens
def read_gmb(corpus_root):
    """Lazily read the GMB corpus rooted at `corpus_root`.

    Walks the directory tree, parses every `.tags` file (tab-separated token
    annotations, sentences separated by blank lines) and yields one sentence
    at a time as [((word, pos_tag), iob_tag), ...] — the pair-plus-class
    shape NLTK classifiers expect.
    """
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    # The scrape had dropped the read; files are UTF-8 encoded.
                    file_content = file_handle.read().decode('utf-8').strip()
                annotated_sentences = file_content.split('\n\n')
                for annotated_sentence in annotated_sentences:
                    annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]
                    standard_form_tokens = []
                    for idx, annotated_token in enumerate(annotated_tokens):
                        annotations = annotated_token.split('\t')
                        # Columns: word, POS tag, (lemma skipped), NER tag.
                        word, tag, ner = annotations[0], annotations[1], annotations[3]
                        if ner != 'O':
                            # Keep only the primary entity class (drop sub-type
                            # suffixes like "geo-nam" -> "geo").
                            ner = ner.split('-')[0]
                        if tag in ('LQU', 'RQU'):   # Make it NLTK compatible
                            tag = "``"
                        standard_form_tokens.append((word, tag, ner))
                    conll_tokens = to_conll_iob(standard_form_tokens)
                    # Make it NLTK Classifier compatible:
                    # [(w1, t1, iob1), ...] -> [((w1, t1), iob1), ...]
                    # because the classifier expects (input, class) pairs.
                    yield [((w, t), iob) for w, t, iob in conll_tokens]
class NamedEntityChunker(ChunkParserI):
    """IOB named-entity chunker backed by an NLTK ClassifierBasedTagger."""

    def __init__(self, train_sents, **kwargs):
        """Train the underlying classifier.

        `train_sents`: iterable of sentences shaped [((word, pos), iob), ...]
        as produced by read_gmb.  Extra keyword arguments are forwarded to
        ClassifierBasedTagger.
        """
        assert isinstance(train_sents, Iterable)

        # NOTE(review): `features` must be a feature-detector callable
        # (tokens, index, history) -> dict; the module-level import from
        # pyexpat does not provide that — confirm the real detector is defined.
        self.feature_detector = features
        # The scrape truncated this call; reconstructed per the tutorial:
        # train on the supplied sentences using `features` as the detector.
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs)

    def parse(self, tagged_sent):
        """Chunk a POS-tagged sentence [(word, pos), ...] into an nltk.Tree."""
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
# Materialize the whole corpus, then split 90% / 10% into train / test sets.
reader = read_gmb(corpus_root)
data = list(reader)
training_samples = data[:int(len(data) * 0.9)]
test_samples = data[int(len(data) * 0.9):]
# Train on a 2000-sentence subset to keep training time manageable.
chunker = NamedEntityChunker(training_samples[:2000])
# Smoke test: POS-tag a sentence and parse it into an entity tree.
print(chunker.parse(pos_tag(word_tokenize("I'm going to Germany this Monday."))))
print("#training samples = %s" % len(training_samples)) # training samples = 55809
print("#test samples = %s" % len(test_samples)) # test samples = 6201
# NOTE(review): `reader` is already exhausted by list(reader) above, so the
# commented-out next(reader) would raise StopIteration if re-enabled.
# print(next(reader))
# print('------------')
# End of script.