fjavieralba/dictionary_tagger.py

## dictionary_tagger.py
class DictionaryTagger(object):
    """Tag POS-tagged sentences with labels looked up in YAML dictionaries.

    Entries from every dictionary file are merged into ``self.dictionary``.
    When the same expression appears in several files, its tag lists are
    concatenated in file order.  Lookups are done on the lower-cased,
    space-joined words of a candidate span, so dictionary keys are expected
    to be lower-case expressions with single spaces between words.
    """

    def __init__(self, dictionary_paths):
        """Load and merge the dictionaries found at ``dictionary_paths``.

        dictionary_paths -- iterable of paths to YAML files, each holding a
                            mapping from expression to a list of tags.
        """
        self.dictionary = {}
        self.max_key_size = 0
        for path in dictionary_paths:
            # ``with`` closes each file deterministically; closing through a
            # lazy ``map`` never actually runs on Python 3.
            with open(path, 'r') as dict_file:
                # NOTE(security): ``yaml.load`` without an explicit Loader can
                # build arbitrary Python objects and is rejected by recent
                # PyYAML releases, so the safe loader is used here.  An empty
                # file loads as None and is treated as an empty dictionary.
                entries = yaml.safe_load(dict_file) or {}
            for key in entries:
                # Always extend a list owned by this instance so the lists
                # returned by the YAML loader are never mutated.
                self.dictionary.setdefault(key, []).extend(entries[key])
                # Character length is a cheap upper bound on the number of
                # tokens a key can span; it only sizes the search window.
                self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences, tag_with_lemmas=False):
        """Tag every sentence and return the results as a list.

        postagged_sentences -- iterable of sentences as accepted by
                               ``tag_sentence``.
        tag_with_lemmas     -- forwarded to ``tag_sentence``.
        """
        return [self.tag_sentence(sentence, tag_with_lemmas)
                for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        Tag one sentence, given as a list of (form, lemma, tags) tokens.

        the result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right

        tag_with_lemmas -- look up the joined lemmas instead of the joined
                           forms.

        Returns a new list.  A matched span is replaced by a single
        (lower-cased form, lower-cased lemma, tags) tuple; a single-token
        match keeps the token's previous tags after the dictionary tags.
        Unmatched tokens are passed through unchanged.
        """
        tagged_sentence = []
        n = len(sentence)
        # With no known key size fall back to the sentence length, but keep
        # that in a local so the tagger's state is not changed by tagging.
        window = self.max_key_size or n
        lookup_field = 1 if tag_with_lemmas else 0
        i = 0
        while i < n:
            match_end = None
            # Try the longest candidate first and shrink it from the right.
            for j in range(min(i + window, n), i, -1):
                literal = ' '.join(
                    [word[lookup_field] for word in sentence[i:j]]).lower()
                if literal in self.dictionary:
                    match_end = j
                    break
            if match_end is None:
                tagged_sentence.append(sentence[i])
                i += 1
                continue
            span = sentence[i:match_end]
            expression_form = ' '.join([word[0] for word in span]).lower()
            expression_lemma = ' '.join([word[1] for word in span]).lower()
            # Copy so neither the dictionary nor the input token is mutated.
            taggings = list(self.dictionary[literal])
            if match_end - i == 1:
                # A single-token match conserves its previous taggings.
                taggings.extend(sentence[i][2])
            tagged_sentence.append(
                (expression_form, expression_lemma, taggings))
            i = match_end
        return tagged_sentence

## tagging_inc_dec.py
# Tag the sentences with the positive, negative, inc and dec dictionaries
# and pretty-print the result.
dicttagger = DictionaryTagger([
    'dicts/positive.yml',
    'dicts/negative.yml',
    'dicts/inc.yml',
    'dicts/dec.yml',
])
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
pprint(dict_tagged_sentences)
[[('What', 'What', ['WP']),
  ('can', 'can', ['MD']),
  ('I', 'I', ['PRP']),
  ('say', 'say', ['VB']),
  ('about', 'about', ['IN']),
  ('this', 'this', ['DT']),
  ('place', 'place', ['NN']),
  ('.', '.', ['.'])],
 [('The', 'The', ['DT']),
  ('staff', 'staff', ['NN']),
  ('of', 'of', ['IN']),
  ('the', 'the', ['DT']),
  ('restaurant', 'restaurant', ['NN']),
  ('is', 'is', ['VBZ']),
  ('nice', 'nice', ['positive', 'JJ']),
  ('and', 'and', ['CC']),
  ('eggplant', 'eggplant', ['NN']),
  ('is', 'is', ['VBZ']),
  ('not', 'not', ['RB']),
  ('bad', 'bad', ['negative', 'JJ']),
  ('.', '.', ['.'])],
 [('apart', 'apart', ['NN']),
  ('from', 'from', ['IN']),
  ('that', 'that', ['DT']),
  (',', ',', [',']),
  ('very', 'very', ['inc', 'RB']),
  ('uninspired', 'uninspired', ['negative', 'VBN']),
  ('food', 'food', ['NN']),
  (',', ',', [',']),
  ('lack', 'lack', ['NN']),
  ('of', 'of', ['IN']),
  ('atmosphere', 'atmosphere', ['NN']),
  ('and', 'and', ['CC']),
  ('too', 'too', ['inc', 'RB']),
  ('expensive', 'expensive', ['negative', 'JJ']),
  ('.', '.', ['.'])],
 [('I', 'I', ['PRP']),
  ('am', 'am', ['VBP']),
  ('a', 'a', ['DT']),
  ('staunch', 'staunch', ['NN']),
  ('vegetarian', 'vegetarian', ['NN']),
  ('and', 'and', ['CC']),
  ('was', 'was', ['VBD']),
  ('sorely', 'sorely', ['inc', 'RB']),
  ('dissapointed', 'dissapointed', ['negative', 'VBN']),
  ('with', 'with', ['IN']),
  ('the', 'the', ['DT']),
  ('veggie', 'veggie', ['NN']),
  ('options', 'options', ['NNS']),
  ('on', 'on', ['IN']),
  ('the', 'the', ['DT']),
  ('menu', 'menu', ['NN']),
  ('.', '.', ['.'])],
 [('Will', 'Will', ['NNP']),
  ('be', 'be', ['VB']),
  ('the', 'the', ['DT']),
  ('last', 'last', ['JJ']),
  ('time', 'time', ['NN']),
  ('I', 'I', ['PRP']),
  ('visit', 'visit', ['VBP']),
  (',', ',', [',']),
  ('I', 'I', ['PRP']),
  ('recommend others to avoid', 'recommend others to avoid', ['negative']),
  ('.', '.', ['.'])]]

## tagging_inverters.py
# Same tagging run with the inv dictionary added to the previous four,
# followed by a pretty-print of the result.
dicttagger = DictionaryTagger([
    'dicts/positive.yml',
    'dicts/negative.yml',
    'dicts/inc.yml',
    'dicts/dec.yml',
    'dicts/inv.yml',
])
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
pprint(dict_tagged_sentences)
[[('What', 'What', ['WP']),
  ('can', 'can', ['MD']),
  ('I', 'I', ['PRP']),
  ('say', 'say', ['VB']),
  ('about', 'about', ['IN']),
  ('this', 'this', ['DT']),
  ('place', 'place', ['NN']),
  ('.', '.', ['.'])],
 [('The', 'The', ['DT']),
  ('staff', 'staff', ['NN']),
  ('of', 'of', ['IN']),
  ('the', 'the', ['DT']),
  ('restaurant', 'restaurant', ['NN']),
  ('is', 'is', ['VBZ']),
  ('nice', 'nice', ['positive', 'JJ']),
  ('and', 'and', ['CC']),
  ('eggplant', 'eggplant', ['NN']),
  ('is', 'is', ['VBZ']),
  ('not', 'not', ['inv', 'RB']),
  ('bad', 'bad', ['negative', 'JJ']),
  ('.', '.', ['.'])],
 [('apart', 'apart', ['NN']),
  ('from', 'from', ['IN']),
  ('that', 'that', ['DT']),
  (',', ',', [',']),
  ('very', 'very', ['inc', 'RB']),
  ('uninspired', 'uninspired', ['negative', 'VBN']),
  ('food', 'food', ['NN']),
  (',', ',', [',']),
  ('lack of', 'lack of', ['inv']),
  ('atmosphere', 'atmosphere', ['NN']),
  ('and', 'and', ['CC']),
  ('too', 'too', ['inc', 'RB']),
  ('expensive', 'expensive', ['negative', 'JJ']),
  ('.', '.', ['.'])],
 [('I', 'I', ['PRP']),
  ('am', 'am', ['VBP']),
  ('a', 'a', ['DT']),
  ('staunch', 'staunch', ['NN']),
  ('vegetarian', 'vegetarian', ['NN']),
  ('and', 'and', ['CC']),
  ('was', 'was', ['VBD']),
  ('sorely', 'sorely', ['inc', 'RB']),
  ('dissapointed', 'dissapointed', ['negative', 'VBN']),
  ('with', 'with', ['IN']),
  ('the', 'the', ['DT']),
  ('veggie', 'veggie', ['NN']),
  ('options', 'options', ['NNS']),
  ('on', 'on', ['IN']),
  ('the', 'the', ['DT']),
  ('menu', 'menu', ['NN']),
  ('.', '.', ['.'])],
 [('Will', 'Will', ['NNP']),
  ('be', 'be', ['VB']),
  ('the', 'the', ['DT']),
  ('last', 'last', ['JJ']),
  ('time', 'time', ['NN']),
  ('I', 'I', ['PRP']),
  ('visit', 'visit', ['VBP']),
  (',', ',', [',']),
  ('I', 'I', ['PRP']),
  ('recommend others to avoid', 'recommend others to avoid', ['negative']),
  ('.', '.', ['.'])]]

## tagging_positive_negative.py
# Tag the sentences with only the positive and negative dictionaries and
# pretty-print the result.
dicttagger = DictionaryTagger([
    'dicts/positive.yml',
    'dicts/negative.yml',
])
dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
pprint(dict_tagged_sentences)
[[('What', 'What', ['WP']),
  ('can', 'can', ['MD']),
  ('I', 'I', ['PRP']),
  ('say', 'say', ['VB']),
  ('about', 'about', ['IN']),
  ('this', 'this', ['DT']),
  ('place', 'place', ['NN']),
  ('.', '.', ['.'])],
 [('The', 'The', ['DT']),
  ('staff', 'staff', ['NN']),
  ('of', 'of', ['IN']),
  ('the', 'the', ['DT']),
  ('restaurant', 'restaurant', ['NN']),
  ('is', 'is', ['VBZ']),
  ('nice', 'nice', ['positive', 'JJ']),
  ('and', 'and', ['CC']),
  ('eggplant', 'eggplant', ['NN']),
  ('is', 'is', ['VBZ']),
  ('not', 'not', ['RB']),
  ('bad', 'bad', ['negative', 'JJ']),
  ('.', '.', ['.'])],
 [('apart', 'apart', ['NN']),
  ('from', 'from', ['IN']),
  ('that', 'that', ['DT']),
  (',', ',', [',']),
  ('very', 'very', ['RB']),
  ('uninspired', 'uninspired', ['negative', 'VBN']),
  ('food', 'food', ['NN']),
  (',', ',', [',']),
  ('lack', 'lack', ['NN']),
  ('of', 'of', ['IN']),
  ('atmosphere', 'atmosphere', ['NN']),
  ('and', 'and', ['CC']),
  ('too', 'too', ['RB']),
  ('expensive', 'expensive', ['negative', 'JJ']),
  ('.', '.', ['.'])],
 [('I', 'I', ['PRP']),
  ('am', 'am', ['VBP']),
  ('a', 'a', ['DT']),
  ('staunch', 'staunch', ['NN']),
  ('vegetarian', 'vegetarian', ['NN']),
  ('and', 'and', ['CC']),
  ('was', 'was', ['VBD']),
  ('sorely', 'sorely', ['RB']),
  ('dissapointed', 'dissapointed', ['negative', 'VBN']),
  ('with', 'with', ['IN']),
  ('the', 'the', ['DT']),
  ('veggie', 'veggie', ['NN']),
  ('options', 'options', ['NNS']),
  ('on', 'on', ['IN']),
  ('the', 'the', ['DT']),
  ('menu', 'menu', ['NN']),
  ('.', '.', ['.'])],
 [('Will', 'Will', ['NNP']),
  ('be', 'be', ['VB']),
  ('the', 'the', ['DT']),
  ('last', 'last', ['JJ']),
  ('time', 'time', ['NN']),
  ('I', 'I', ['PRP']),
  ('visit', 'visit', ['VBP']),
  (',', ',', [',']),
  ('I', 'I', ['PRP']),
  ('recommend others to avoid', 'recommend others to avoid', ['negative']),
  ('.', '.', ['.'])]]
	class DictionaryTagger(object):
	    """Tag POS-tagged sentences with labels looked up in YAML dictionaries.

	    Entries from every dictionary file are merged into ``self.dictionary``.
	    When the same expression appears in several files, its tag lists are
	    concatenated in file order.  Lookups are done on the lower-cased,
	    space-joined words of a candidate span, so dictionary keys are expected
	    to be lower-case expressions with single spaces between words.
	    """

	    def __init__(self, dictionary_paths):
	        """Load and merge the dictionaries found at ``dictionary_paths``.

	        dictionary_paths -- iterable of paths to YAML files, each holding a
	                            mapping from expression to a list of tags.
	        """
	        self.dictionary = {}
	        self.max_key_size = 0
	        for path in dictionary_paths:
	            # ``with`` closes each file deterministically; closing through a
	            # lazy ``map`` never actually runs on Python 3.
	            with open(path, 'r') as dict_file:
	                # NOTE(security): ``yaml.load`` without an explicit Loader can
	                # build arbitrary Python objects and is rejected by recent
	                # PyYAML releases, so the safe loader is used here.  An empty
	                # file loads as None and is treated as an empty dictionary.
	                entries = yaml.safe_load(dict_file) or {}
	            for key in entries:
	                # Always extend a list owned by this instance so the lists
	                # returned by the YAML loader are never mutated.
	                self.dictionary.setdefault(key, []).extend(entries[key])
	                # Character length is a cheap upper bound on the number of
	                # tokens a key can span; it only sizes the search window.
	                self.max_key_size = max(self.max_key_size, len(key))

	    def tag(self, postagged_sentences, tag_with_lemmas=False):
	        """Tag every sentence and return the results as a list.

	        postagged_sentences -- iterable of sentences as accepted by
	                               ``tag_sentence``.
	        tag_with_lemmas     -- forwarded to ``tag_sentence``.
	        """
	        return [self.tag_sentence(sentence, tag_with_lemmas)
	                for sentence in postagged_sentences]

	    def tag_sentence(self, sentence, tag_with_lemmas=False):
	        """
	        Tag one sentence, given as a list of (form, lemma, tags) tokens.

	        the result is only one tagging of all the possible ones.
	        The resulting tagging is determined by these two priority rules:
	            - longest matches have higher priority
	            - search is made from left to right

	        tag_with_lemmas -- look up the joined lemmas instead of the joined
	                           forms.

	        Returns a new list.  A matched span is replaced by a single
	        (lower-cased form, lower-cased lemma, tags) tuple; a single-token
	        match keeps the token's previous tags after the dictionary tags.
	        Unmatched tokens are passed through unchanged.
	        """
	        tagged_sentence = []
	        n = len(sentence)
	        # With no known key size fall back to the sentence length, but keep
	        # that in a local so the tagger's state is not changed by tagging.
	        window = self.max_key_size or n
	        lookup_field = 1 if tag_with_lemmas else 0
	        i = 0
	        while i < n:
	            match_end = None
	            # Try the longest candidate first and shrink it from the right.
	            for j in range(min(i + window, n), i, -1):
	                literal = ' '.join(
	                    [word[lookup_field] for word in sentence[i:j]]).lower()
	                if literal in self.dictionary:
	                    match_end = j
	                    break
	            if match_end is None:
	                tagged_sentence.append(sentence[i])
	                i += 1
	                continue
	            span = sentence[i:match_end]
	            expression_form = ' '.join([word[0] for word in span]).lower()
	            expression_lemma = ' '.join([word[1] for word in span]).lower()
	            # Copy so neither the dictionary nor the input token is mutated.
	            taggings = list(self.dictionary[literal])
	            if match_end - i == 1:
	                # A single-token match conserves its previous taggings.
	                taggings.extend(sentence[i][2])
	            tagged_sentence.append(
	                (expression_form, expression_lemma, taggings))
	            i = match_end
	        return tagged_sentence
	# Tag the sentences with the positive, negative, inc and dec dictionaries
	# and pretty-print the result.
	dicttagger = DictionaryTagger([
	    'dicts/positive.yml',
	    'dicts/negative.yml',
	    'dicts/inc.yml',
	    'dicts/dec.yml',
	])
	dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
	pprint(dict_tagged_sentences)
	[[('What', 'What', ['WP']),
	('can', 'can', ['MD']),
	('I', 'I', ['PRP']),
	('say', 'say', ['VB']),
	('about', 'about', ['IN']),
	('this', 'this', ['DT']),
	('place', 'place', ['NN']),
	('.', '.', ['.'])],
	[('The', 'The', ['DT']),
	('staff', 'staff', ['NN']),
	('of', 'of', ['IN']),
	('the', 'the', ['DT']),
	('restaurant', 'restaurant', ['NN']),
	('is', 'is', ['VBZ']),
	('nice', 'nice', ['positive', 'JJ']),
	('and', 'and', ['CC']),
	('eggplant', 'eggplant', ['NN']),
	('is', 'is', ['VBZ']),
	('not', 'not', ['RB']),
	('bad', 'bad', ['negative', 'JJ']),
	('.', '.', ['.'])],
	[('apart', 'apart', ['NN']),
	('from', 'from', ['IN']),
	('that', 'that', ['DT']),
	(',', ',', [',']),
	('very', 'very', ['inc', 'RB']),
	('uninspired', 'uninspired', ['negative', 'VBN']),
	('food', 'food', ['NN']),
	(',', ',', [',']),
	('lack', 'lack', ['NN']),
	('of', 'of', ['IN']),
	('atmosphere', 'atmosphere', ['NN']),
	('and', 'and', ['CC']),
	('too', 'too', ['inc', 'RB']),
	('expensive', 'expensive', ['negative', 'JJ']),
	('.', '.', ['.'])],
	[('I', 'I', ['PRP']),
	('am', 'am', ['VBP']),
	('a', 'a', ['DT']),
	('staunch', 'staunch', ['NN']),
	('vegetarian', 'vegetarian', ['NN']),
	('and', 'and', ['CC']),
	('was', 'was', ['VBD']),
	('sorely', 'sorely', ['inc', 'RB']),
	('dissapointed', 'dissapointed', ['negative', 'VBN']),
	('with', 'with', ['IN']),
	('the', 'the', ['DT']),
	('veggie', 'veggie', ['NN']),
	('options', 'options', ['NNS']),
	('on', 'on', ['IN']),
	('the', 'the', ['DT']),
	('menu', 'menu', ['NN']),
	('.', '.', ['.'])],
	[('Will', 'Will', ['NNP']),
	('be', 'be', ['VB']),
	('the', 'the', ['DT']),
	('last', 'last', ['JJ']),
	('time', 'time', ['NN']),
	('I', 'I', ['PRP']),
	('visit', 'visit', ['VBP']),
	(',', ',', [',']),
	('I', 'I', ['PRP']),
	('recommend others to avoid', 'recommend others to avoid', ['negative']),
	('.', '.', ['.'])]]