Skip to content

Instantly share code, notes, and snippets.

@fjavieralba
Last active September 6, 2018 09:50
Show Gist options
  • Save fjavieralba/3969016 to your computer and use it in GitHub Desktop.
Save fjavieralba/3969016 to your computer and use it in GitHub Desktop.
Python class for tagging text with dictionaries
class DictionaryTagger(object):
    """Tag tokenised sentences with labels looked up in YAML dictionaries.

    Each dictionary file maps an expression (one or more words joined by
    single spaces) to a list of tags.  Sentence text is lower-cased before
    the lookup, so only lower-case keys can ever match.

    Attributes:
        dictionary: merged mapping ``expression -> list of tags``.
        max_key_size: length in characters of the longest key; used as the
            maximum number of tokens tried for a single match.
    """

    def __init__(self, dictionary_paths):
        """Load and merge the YAML dictionaries at *dictionary_paths*.

        When the same key occurs in several files its tag lists are
        concatenated in file order.

        Args:
            dictionary_paths: iterable of paths to YAML files, each holding
                a mapping from expression to a list of tags.

        Raises:
            IOError/OSError: if a file cannot be opened.
            yaml.YAMLError: if a file is not valid YAML.
        """
        self.dictionary = {}
        self.max_key_size = 0
        for path in dictionary_paths:
            # ``with`` guarantees the file is closed; the former
            # ``map(lambda x: x.close(), files)`` is lazy on Python 3 and
            # never actually ran.
            with open(path, 'r') as dict_file:
                # NOTE(security): safe_load replaces the bare yaml.load(),
                # which can build arbitrary Python objects from the file and
                # requires an explicit Loader on recent PyYAML releases.
                # An empty file loads as None, hence the ``or {}``.
                curr_dict = yaml.safe_load(dict_file) or {}
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    # Copy so that later extend() calls never mutate a list
                    # shared with the loaded YAML document.
                    self.dictionary[key] = list(curr_dict[key])
                    # Character length is a cheap upper bound on how many
                    # (non-empty) tokens the key can span.
                    self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences, tag_with_lemmas=False):
        """Tag every sentence in *postagged_sentences*.

        Args:
            postagged_sentences: iterable of sentences as accepted by
                :meth:`tag_sentence`.
            tag_with_lemmas: forwarded to :meth:`tag_sentence`.

        Returns:
            A list with one tagged sentence per input sentence.
        """
        return [self.tag_sentence(sentence, tag_with_lemmas)
                for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """Tag one sentence with the dictionary entries it contains.

        The result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right

        Args:
            sentence: sequence of tokens ``(form, lemma, tags)`` where
                ``tags`` is a list.
            tag_with_lemmas: match dictionary keys against the lemmas
                instead of the surface forms.

        Returns:
            A new list of tokens.  A matched span is replaced by a single
            ``(form, lemma, tags)`` tuple whose form and lemma are the
            lower-cased, space-joined forms/lemmas of the span.  A matched
            single token keeps its previous tags after the dictionary tags;
            a multi-token match carries the dictionary tags only.
            Unmatched tokens are passed through unchanged.
        """
        tagged_sentence = []
        n_tokens = len(sentence)
        # With an empty dictionary fall back to the sentence length, but
        # only locally: writing it back to self.max_key_size would leak the
        # size of the first sentence into every later call.
        max_span = self.max_key_size or n_tokens
        # Index of the token field that is matched against the dictionary.
        lookup_field = 1 if tag_with_lemmas else 0

        i = 0
        while i < n_tokens:
            # Try the longest candidate span first, then shrink it.
            for j in range(min(i + max_span, n_tokens), i, -1):
                span = sentence[i:j]
                literal = ' '.join(
                    [word[lookup_field] for word in span]).lower()
                if literal not in self.dictionary:
                    continue
                expression_form = ' '.join(
                    [word[0] for word in span]).lower()
                expression_lemma = ' '.join(
                    [word[1] for word in span]).lower()
                taggings = list(self.dictionary[literal])
                if j - i == 1:
                    # A single matched token conserves its previous tags.
                    taggings.extend(sentence[i][2])
                tagged_sentence.append(
                    (expression_form, expression_lemma, taggings))
                i = j
                break
            else:
                # No span starting at i matched: keep the token as it is.
                tagged_sentence.append(sentence[i])
                i += 1
        return tagged_sentence
# Tag the POS-tagged sentences with the positive, negative, inc and dec
# dictionaries, then pretty-print the tagged result.
dicttagger = DictionaryTagger([
    'dicts/positive.yml',
    'dicts/negative.yml',
    'dicts/inc.yml',
    'dicts/dec.yml',
])
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
pprint(dict_tagged_sentences)
[[('What', 'What', ['WP']),
('can', 'can', ['MD']),
('I', 'I', ['PRP']),
('say', 'say', ['VB']),
('about', 'about', ['IN']),
('this', 'this', ['DT']),
('place', 'place', ['NN']),
('.', '.', ['.'])],
[('The', 'The', ['DT']),
('staff', 'staff', ['NN']),
('of', 'of', ['IN']),
('the', 'the', ['DT']),
('restaurant', 'restaurant', ['NN']),
('is', 'is', ['VBZ']),
('nice', 'nice', ['positive', 'JJ']),
('and', 'and', ['CC']),
('eggplant', 'eggplant', ['NN']),
('is', 'is', ['VBZ']),
('not', 'not', ['RB']),
('bad', 'bad', ['negative', 'JJ']),
('.', '.', ['.'])],
[('apart', 'apart', ['NN']),
('from', 'from', ['IN']),
('that', 'that', ['DT']),
(',', ',', [',']),
('very', 'very', ['inc', 'RB']),
('uninspired', 'uninspired', ['negative', 'VBN']),
('food', 'food', ['NN']),
(',', ',', [',']),
('lack', 'lack', ['NN']),
('of', 'of', ['IN']),
('atmosphere', 'atmosphere', ['NN']),
('and', 'and', ['CC']),
('too', 'too', ['inc', 'RB']),
('expensive', 'expensive', ['negative', 'JJ']),
('.', '.', ['.'])],
[('I', 'I', ['PRP']),
('am', 'am', ['VBP']),
('a', 'a', ['DT']),
('staunch', 'staunch', ['NN']),
('vegetarian', 'vegetarian', ['NN']),
('and', 'and', ['CC']),
('was', 'was', ['VBD']),
('sorely', 'sorely', ['inc', 'RB']),
('dissapointed', 'dissapointed', ['negative', 'VBN']),
('with', 'with', ['IN']),
('the', 'the', ['DT']),
('veggie', 'veggie', ['NN']),
('options', 'options', ['NNS']),
('on', 'on', ['IN']),
('the', 'the', ['DT']),
('menu', 'menu', ['NN']),
('.', '.', ['.'])],
[('Will', 'Will', ['NNP']),
('be', 'be', ['VB']),
('the', 'the', ['DT']),
('last', 'last', ['JJ']),
('time', 'time', ['NN']),
('I', 'I', ['PRP']),
('visit', 'visit', ['VBP']),
(',', ',', [',']),
('I', 'I', ['PRP']),
('recommend others to avoid', 'recommend others to avoid', ['negative']),
('.', '.', ['.'])]]
# Same run with the inv dictionary added to the positive, negative, inc and
# dec dictionaries; the tagged result is pretty-printed.
dicttagger = DictionaryTagger([
    'dicts/positive.yml',
    'dicts/negative.yml',
    'dicts/inc.yml',
    'dicts/dec.yml',
    'dicts/inv.yml',
])
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
pprint(dict_tagged_sentences)
[[('What', 'What', ['WP']),
('can', 'can', ['MD']),
('I', 'I', ['PRP']),
('say', 'say', ['VB']),
('about', 'about', ['IN']),
('this', 'this', ['DT']),
('place', 'place', ['NN']),
('.', '.', ['.'])],
[('The', 'The', ['DT']),
('staff', 'staff', ['NN']),
('of', 'of', ['IN']),
('the', 'the', ['DT']),
('restaurant', 'restaurant', ['NN']),
('is', 'is', ['VBZ']),
('nice', 'nice', ['positive', 'JJ']),
('and', 'and', ['CC']),
('eggplant', 'eggplant', ['NN']),
('is', 'is', ['VBZ']),
('not', 'not', ['inv', 'RB']),
('bad', 'bad', ['negative', 'JJ']),
('.', '.', ['.'])],
[('apart', 'apart', ['NN']),
('from', 'from', ['IN']),
('that', 'that', ['DT']),
(',', ',', [',']),
('very', 'very', ['inc', 'RB']),
('uninspired', 'uninspired', ['negative', 'VBN']),
('food', 'food', ['NN']),
(',', ',', [',']),
('lack of', 'lack of', ['inv']),
('atmosphere', 'atmosphere', ['NN']),
('and', 'and', ['CC']),
('too', 'too', ['inc', 'RB']),
('expensive', 'expensive', ['negative', 'JJ']),
('.', '.', ['.'])],
[('I', 'I', ['PRP']),
('am', 'am', ['VBP']),
('a', 'a', ['DT']),
('staunch', 'staunch', ['NN']),
('vegetarian', 'vegetarian', ['NN']),
('and', 'and', ['CC']),
('was', 'was', ['VBD']),
('sorely', 'sorely', ['inc', 'RB']),
('dissapointed', 'dissapointed', ['negative', 'VBN']),
('with', 'with', ['IN']),
('the', 'the', ['DT']),
('veggie', 'veggie', ['NN']),
('options', 'options', ['NNS']),
('on', 'on', ['IN']),
('the', 'the', ['DT']),
('menu', 'menu', ['NN']),
('.', '.', ['.'])],
[('Will', 'Will', ['NNP']),
('be', 'be', ['VB']),
('the', 'the', ['DT']),
('last', 'last', ['JJ']),
('time', 'time', ['NN']),
('I', 'I', ['PRP']),
('visit', 'visit', ['VBP']),
(',', ',', [',']),
('I', 'I', ['PRP']),
('recommend others to avoid', 'recommend others to avoid', ['negative']),
('.', '.', ['.'])]]
# Tag the POS-tagged sentences with the positive and negative dictionaries
# only, then pretty-print the tagged result.
dicttagger = DictionaryTagger([
    'dicts/positive.yml',
    'dicts/negative.yml',
])
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
pprint(dict_tagged_sentences)
[[('What', 'What', ['WP']),
('can', 'can', ['MD']),
('I', 'I', ['PRP']),
('say', 'say', ['VB']),
('about', 'about', ['IN']),
('this', 'this', ['DT']),
('place', 'place', ['NN']),
('.', '.', ['.'])],
[('The', 'The', ['DT']),
('staff', 'staff', ['NN']),
('of', 'of', ['IN']),
('the', 'the', ['DT']),
('restaurant', 'restaurant', ['NN']),
('is', 'is', ['VBZ']),
('nice', 'nice', ['positive', 'JJ']),
('and', 'and', ['CC']),
('eggplant', 'eggplant', ['NN']),
('is', 'is', ['VBZ']),
('not', 'not', ['RB']),
('bad', 'bad', ['negative', 'JJ']),
('.', '.', ['.'])],
[('apart', 'apart', ['NN']),
('from', 'from', ['IN']),
('that', 'that', ['DT']),
(',', ',', [',']),
('very', 'very', ['RB']),
('uninspired', 'uninspired', ['negative', 'VBN']),
('food', 'food', ['NN']),
(',', ',', [',']),
('lack', 'lack', ['NN']),
('of', 'of', ['IN']),
('atmosphere', 'atmosphere', ['NN']),
('and', 'and', ['CC']),
('too', 'too', ['RB']),
('expensive', 'expensive', ['negative', 'JJ']),
('.', '.', ['.'])],
[('I', 'I', ['PRP']),
('am', 'am', ['VBP']),
('a', 'a', ['DT']),
('staunch', 'staunch', ['NN']),
('vegetarian', 'vegetarian', ['NN']),
('and', 'and', ['CC']),
('was', 'was', ['VBD']),
('sorely', 'sorely', ['RB']),
('dissapointed', 'dissapointed', ['negative', 'VBN']),
('with', 'with', ['IN']),
('the', 'the', ['DT']),
('veggie', 'veggie', ['NN']),
('options', 'options', ['NNS']),
('on', 'on', ['IN']),
('the', 'the', ['DT']),
('menu', 'menu', ['NN']),
('.', '.', ['.'])],
[('Will', 'Will', ['NNP']),
('be', 'be', ['VB']),
('the', 'the', ['DT']),
('last', 'last', ['JJ']),
('time', 'time', ['NN']),
('I', 'I', ['PRP']),
('visit', 'visit', ['VBP']),
(',', ',', [',']),
('I', 'I', ['PRP']),
('recommend others to avoid', 'recommend others to avoid', ['negative']),
('.', '.', ['.'])]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment