@bogsio
Last active February 29, 2020 00:18
NER Python
# https://nlpforhackers.io/named-entity-extraction/
import os
import string
import collections
import pickle
from collections.abc import Iterable  # 'Iterable' lives in collections.abc on Python 3
from nltk.tag import ClassifierBasedTagger
from nltk.chunk import ChunkParserI, conlltags2tree, tree2conlltags
from nltk import pos_tag, word_tokenize
from nltk.stem import SnowballStemmer
ner_tags = collections.Counter()
corpus_root = "gmb-2.2.0" # Make sure you set the proper path to the unzipped corpus
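# (Illustrative assumption, not from the gist: after unzipping GMB 2.2.0 the annotated files
#  sit under paths like gmb-2.2.0/data/p00/d0001/en.tags; read_gmb below walks the tree and
#  picks up every *.tags file.)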
def to_conll_iob(annotated_sentence):
    """
    `annotated_sentence` = list of triplets [(w1, t1, iob1), ...]

    Transform a pseudo-IOB notation: O, PERSON, PERSON, O, O, LOCATION, O
    to proper IOB notation: O, B-PERSON, I-PERSON, O, O, B-LOCATION, O
    """
    proper_iob_tokens = []
    for idx, annotated_token in enumerate(annotated_sentence):
        word, tag, ner = annotated_token

        if ner != 'O':
            if idx == 0:
                ner = "B-" + ner
            elif annotated_sentence[idx - 1][2] == ner:
                ner = "I-" + ner
            else:
                ner = "B-" + ner
        proper_iob_tokens.append((word, tag, ner))
    return proper_iob_tokens
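# Illustration (not part of the original gist) of the transform above on a tiny sentence:
# to_conll_iob([('John', 'NNP', 'per'), ('Smith', 'NNP', 'per'), ('left', 'VBD', 'O')])
# -> [('John', 'NNP', 'B-per'), ('Smith', 'NNP', 'I-per'), ('left', 'VBD', 'O')]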
def features(tokens, index, history):
    """
    `tokens` = a POS-tagged sentence [(w1, t1), ...]
    `index` = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags
    """
    # init the stemmer
    stemmer = SnowballStemmer('english')

    # Pad the sequence with placeholders
    tokens = [('[START2]', '[START2]'), ('[START1]', '[START1]')] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
    history = ['[START2]', '[START1]'] + list(history)

    # shift the index with 2, to accommodate the padding
    index += 2

    word, pos = tokens[index]
    prevword, prevpos = tokens[index - 1]
    prevprevword, prevprevpos = tokens[index - 2]
    nextword, nextpos = tokens[index + 1]
    nextnextword, nextnextpos = tokens[index + 2]
    previob = history[index - 1]
    contains_dash = '-' in word
    contains_dot = '.' in word
    allascii = all(c in string.ascii_lowercase for c in word)  # the original list comprehension always evaluated to True
    allcaps = word == word.capitalize()
    capitalized = word[0] in string.ascii_uppercase
    prevallcaps = prevword == prevword.capitalize()
    prevcapitalized = prevword[0] in string.ascii_uppercase
    nextallcaps = nextword == nextword.capitalize()          # was mistakenly based on prevword
    nextcapitalized = nextword[0] in string.ascii_uppercase  # was mistakenly based on prevword

    return {
        'word': word,
        'lemma': stemmer.stem(word),
        'pos': pos,
        'all-ascii': allascii,

        'next-word': nextword,
        'next-lemma': stemmer.stem(nextword),
        'next-pos': nextpos,

        'next-next-word': nextnextword,
        'next-next-pos': nextnextpos,

        'prev-word': prevword,
        'prev-lemma': stemmer.stem(prevword),
        'prev-pos': prevpos,

        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,

        'prev-iob': previob,

        'contains-dash': contains_dash,
        'contains-dot': contains_dot,

        'all-caps': allcaps,
        'capitalized': capitalized,

        'prev-all-caps': prevallcaps,
        'prev-capitalized': prevcapitalized,

        'next-all-caps': nextallcaps,
        'next-capitalized': nextcapitalized,
    }
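# Illustration (not part of the original gist): the tagger calls features() once per token, e.g.
# features([('I', 'PRP'), ('love', 'VBP'), ('Paris', 'NNP')], 2, ['O', 'O'])
# returns a dict with entries such as 'word': 'Paris', 'prev-word': 'love', 'prev-iob': 'O', ...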
def read_gmb(corpus_root):
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if filename.endswith(".tags"):
                with open(os.path.join(root, filename), 'rb') as file_handle:
                    file_content = file_handle.read().decode('utf-8').strip()
                    annotated_sentences = file_content.split('\n\n')
                    for annotated_sentence in annotated_sentences:
                        annotated_tokens = [seq for seq in annotated_sentence.split('\n') if seq]

                        standard_form_tokens = []

                        for idx, annotated_token in enumerate(annotated_tokens):
                            annotations = annotated_token.split('\t')
                            word, tag, ner = annotations[0], annotations[1], annotations[3]

                            if ner != 'O':
                                ner = ner.split('-')[0]

                            if tag in ('LQU', 'RQU'):   # Make it NLTK compatible
                                tag = "``"

                            standard_form_tokens.append((word, tag, ner))

                        conll_tokens = to_conll_iob(standard_form_tokens)

                        # Make it NLTK Classifier compatible - [(w1, t1, iob1), ...] to [((w1, t1), iob1), ...]
                        # Because the classifier expects a tuple as input: first item the input, second the class
                        yield [((w, t), iob) for w, t, iob in conll_tokens]
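# Note on the expected input (this follows from the parsing above): each .tags file holds one
# sentence per blank-line-separated block, one token per line, with tab-separated columns; only
# column 0 (word), column 1 (POS tag) and column 3 (NER tag, e.g. 'per' or 'geo', possibly with
# a subtype suffix that the split('-') above strips) are used.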
class NamedEntityChunker(ChunkParserI):
    def __init__(self, train_sents, **kwargs):
        assert isinstance(train_sents, Iterable)

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs)

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
reader = read_gmb(corpus_root)
data = list(reader)
training_samples = data[:int(len(data) * 0.9)]
test_samples = data[int(len(data) * 0.9):]
chunker = NamedEntityChunker(training_samples[:2000])
print(chunker.parse(pos_tag(word_tokenize("I'm going to Germany this Monday."))))
print("#training samples = %s" % len(training_samples)) # training samples = 55809
print("#test samples = %s" % len(test_samples)) # test samples = 6201
@irfanandratama

Are you sure line 27 and line 36 are correct? When we read the corpus we build the tuple as (word, tag, ner), but on lines 27 and 36 it is unpacked as (tag, word, ner). Why the change?

@ketul14

ketul14 commented Oct 5, 2018

@bogdan-ivanov: Excellent! Thanks. How is the performance of this, given the many for loops? Is it faster compared to Stanford NER?

@samsav

samsav commented Dec 12, 2018

Thanks for the tutorial! I noticed that there appears to be a slight error on lines 72 and 73: the features nextallcaps and nextcapitalized reference the previous word, not the next word.

@madhurirao225

madhurirao225 commented Jan 10, 2019

Thanks for the code, but I got an error at line 148 or 149, and 167. I worked on it but couldn't resolve it. Could you please provide the datasets for your code?
