Last active
April 28, 2018 03:40
-
-
Save cparello/1fc4f100543b9e5f097d4d7642e5b9cf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
"""Train an NLTK classifier-based named-entity chunker on the GMB 2.2.0 corpus."""
import collections
import os
import pickle

# Fix: `collections.Iterable` was removed in Python 3.10; `collections.abc`
# has been the correct home for the ABCs since Python 3.3.
from collections.abc import Iterable

# NOTE(review): this import is almost certainly a placeholder so the name
# `features` resolves — the real feature-detector function
# `features(tokens, index, history)` appears to be missing from this gist.
# Confirm before training.
from pyexpat import features

from nltk import pos_tag, word_tokenize
from nltk.chunk import ChunkParserI, conlltags2tree, tree2conlltags
from nltk.tag import ClassifierBasedTagger

# Running tally of NER tag frequencies (not updated in the visible code).
ner_tags = collections.Counter()
corpus_root = "gmb-2.2.0"  # Make sure you set the proper path to the unzipped corpus
def to_conll_iob(annotated_sentence):
    """
    `annotated_sentence` = list of triplets [(w1, t1, iob1), ...]
    Transform a pseudo-IOB notation: O, PERSON, PERSON, O, O, LOCATION, O
    to proper IOB notation: O, B-PERSON, I-PERSON, O, O, B-LOCATION, O
    """
    result = []
    prev_label = None  # raw NER label of the preceding token (None at start)
    for word, tag, label in annotated_sentence:
        if label == 'O':
            result.append((word, tag, label))
        else:
            # A token continues the current chunk ("I-") only when the
            # previous token carried the same raw label; otherwise it
            # begins a new chunk ("B-").
            prefix = "I-" if label == prev_label else "B-"
            result.append((word, tag, prefix + label))
        prev_label = label
    return result
def read_gmb(corpus_root):
    """Walk the GMB corpus and yield one sentence at a time.

    Each yielded sentence has the NLTK-classifier shape
    [((word, pos_tag), iob_tag), ...].
    """
    for root, dirs, files in os.walk(corpus_root):
        for filename in files:
            if not filename.endswith(".tags"):
                continue
            path = os.path.join(root, filename)
            with open(path, 'rb') as handle:
                content = handle.read().decode('utf-8').strip()
            # Sentences are separated by blank lines; tokens by newlines.
            for sentence_block in content.split('\n\n'):
                tokens = []
                for line in sentence_block.split('\n'):
                    if not line:
                        continue
                    fields = line.split('\t')
                    word, tag, ner = fields[0], fields[1], fields[3]
                    if ner != 'O':
                        # Keep only the major NE category (e.g. "per-tit" -> "per").
                        ner = ner.split('-')[0]
                    if tag in ('LQU', 'RQU'):  # Make it NLTK compatible
                        tag = "``"
                    tokens.append((word, tag, ner))
                # Convert pseudo-IOB to proper IOB, then reshape each triplet
                # (w, t, iob) into the ((w, t), iob) pair the classifier expects.
                yield [((w, t), iob) for w, t, iob in to_conll_iob(tokens)]
class NamedEntityChunker(ChunkParserI):
    """NER chunker backed by an NLTK ClassifierBasedTagger.

    `train_sents` is an iterable of sentences in the
    [((word, pos_tag), iob_tag), ...] shape produced by read_gmb().
    Extra keyword arguments are forwarded to ClassifierBasedTagger.
    """

    def __init__(self, train_sents, **kwargs):
        # Fix: `collections.Iterable` was removed in Python 3.10; the ABC
        # lives in `collections.abc` on every supported Python 3 version.
        from collections.abc import Iterable
        # Fix: raise instead of `assert` — asserts are stripped under `python -O`,
        # silently disabling this validation.
        if not isinstance(train_sents, Iterable):
            raise TypeError("train_sents must be an iterable of training sentences")
        # NOTE(review): `features` is imported from pyexpat at the top of the
        # file — almost certainly a placeholder; the real feature-detector
        # function seems to be missing from this gist. Confirm before training.
        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=train_sents,
            feature_detector=features,
            **kwargs)

    def parse(self, tagged_sent):
        """Chunk a POS-tagged sentence and return an nltk.Tree of NE chunks."""
        chunks = self.tagger.tag(tagged_sent)
        # Transform the result from [((w1, t1), iob1), ...]
        # to the triplet format [(w1, t1, iob1), ...] that conlltags2tree wants.
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
        # Transform the list of triplets to nltk.Tree format.
        return conlltags2tree(iob_triplets)
# --- Driver: load the corpus, split 90/10, train, and demo the chunker. ---
reader = read_gmb(corpus_root)
data = list(reader)
split_at = int(len(data) * 0.9)
training_samples = data[:split_at]
test_samples = data[split_at:]

# Train on only the first 2000 sentences to keep classifier training fast.
chunker = NamedEntityChunker(training_samples[:2000])
print(chunker.parse(pos_tag(word_tokenize("I'm going to Germany this Monday."))))
print("#training samples = %s" % len(training_samples))  # training samples = 55809
print("#test samples = %s" % len(test_samples))  # test samples = 6201
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.