Lexical entailment featurizer for substitution edits
# Python 2 / NLTK 2.x idioms throughout (print statements, xrange,
# synset.lemmas and lemma.name as attributes rather than methods).
from __future__ import division
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.metrics import edit_distance
from nltk.corpus.reader.wordnet import WordNetError
import numpy as np
import logging, os
import Alignment_sub  # sibling module defining substitution-edit alignments (not included in this gist)
class Lexent_featurizer_sub:
    """Extracts lexical-entailment features for substitution edits."""

    def __init__(self):
        logging.basicConfig(level=logging.DEBUG)
        # WordNet information-content files for the similarity measures
        self.brown_ic = wordnet_ic.ic('ic-brown.dat')
        self.semcor_ic = wordnet_ic.ic('ic-semcor.dat')
        # Nominalization pairs, kept with trailing newlines to match the
        # whole-line lookups in getNomB
        filename = os.path.join(os.path.dirname(__file__), 'resources/verb_nom_tuples.txt')
        with open(filename) as f:
            self.nom_adj_verb_tuples = f.readlines()
        stoplistFile = os.path.join(os.path.dirname(__file__), 'resources/stoplist.txt')
        with open(stoplistFile) as f:
            self.stoplist = f.read().splitlines()
        prepositionsFile = os.path.join(os.path.dirname(__file__), 'resources/prepositions.txt')
        with open(prepositionsFile) as f:
            self.prepositions = f.read().splitlines()
        pronounsFile = os.path.join(os.path.dirname(__file__), 'resources/pronouns.txt')
        with open(pronounsFile) as f:
            self.pronouns = f.read().splitlines()
    def getNomB(self, alignment):
        # 0.75 if the token pair appears (in either order) in the
        # nominalization list; lines were loaded with trailing newlines
        h = alignment.h_token
        p = alignment.p_token
        pair1 = h + ',' + p + '\n'
        pair2 = p + ',' + h + '\n'
        if pair1 in self.nom_adj_verb_tuples or pair2 in self.nom_adj_verb_tuples:
            return 0.75
        return 0
    def getNNNN(self, alignment):
        # 1 if both tokens are tagged exclusively as common nouns, or both
        # exclusively as proper nouns
        noun_types = ['NN', 'NNS']
        proper_noun_types = ['NNP', 'NNPS']
        h_misses = [tag for tag in alignment.h_penn_tag if tag not in noun_types]
        p_misses = [tag for tag in alignment.p_penn_tag if tag not in noun_types]
        if len(h_misses) == 0 and len(p_misses) == 0:
            return 1
        h_misses = [tag for tag in alignment.h_penn_tag if tag not in proper_noun_types]
        p_misses = [tag for tag in alignment.p_penn_tag if tag not in proper_noun_types]
        if len(h_misses) == 0 and len(p_misses) == 0:
            return 1
        return 0
    def getPronoun(self, alignment):
        if alignment.h_token in self.pronouns and alignment.p_token in self.pronouns:
            return 1
        return 0
    def getLemStrSim(self, alignment):
        # Normalized edit-distance similarity; the near-2 offset in the
        # denominator discounts very short tokens.  Clamped to [0, 1] so
        # that tokens of length <= 2, where the denominator goes
        # non-positive, cannot produce out-of-range scores.
        p = alignment.p_token
        h = alignment.h_token
        distance = edit_distance(h, p)
        max_length = max(len(h), len(p))
        score = 1 - (distance / (max_length - 2.000000001))
        return max(0, min(1, score))
    def getLight(self, alignment):
        if alignment.h_token in self.stoplist and alignment.p_token in self.stoplist:
            return 1
        return 0

    def getPreps(self, alignment):
        if alignment.h_token in self.prepositions and alignment.p_token in self.prepositions:
            return 1
        return 0
    def contains(self, small, big):
        # True if 'small' occurs as a contiguous subsequence of 'big'
        for i in xrange(len(big) - len(small) + 1):
            for j in xrange(len(small)):
                if big[i + j] != small[j]:
                    break
            else:
                #return i, i+len(small)
                return True
        return False

    def phrase_contains(self, p, h):
        return self.contains(p, h) or self.contains(h, p)
    # TODO lemmas should be passed to the following
    # TODO or should they lemmatize tokens?
    def getLemSubSeqF(self, alignment):
        # forward containment: h is a proper substring of p
        p = alignment.p_token
        h = alignment.h_token
        if h in p and p != h:
            return 1
        return 0

    def getLemSubSeqR(self, alignment):
        # reverse containment: p is a proper substring of h
        p = alignment.p_token
        h = alignment.h_token
        if p in h and p != h:
            return 1
        return 0

    def getLemSubSeqE(self, alignment):
        # the tokens are identical
        if alignment.h_token == alignment.p_token:
            return 1
        return 0

    def getLemSubSeqN(self, alignment):
        # neither token contains the other
        p = alignment.p_token
        h = alignment.h_token
        if p != h and p not in h and h not in p:
            return 1
        return 0
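    # Taken together, the four getLemSubSeq* features one-hot encode the
    # substring relation between the tokens: exactly one of forward
    # containment, reverse containment, equality, or no containment fires
    # for any token pair.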
    def getDLin(self, pSynsets, hSynsets):
        # Best Lin similarity over all synset pairs, under both the Brown
        # and SemCor information-content corpora; capped at 1
        scores = [0]
        for p_synset in pSynsets:
            for h_synset in hSynsets:
                try:
                    brown_score = p_synset.lin_similarity(h_synset, self.brown_ic)
                    scores.append(min(brown_score, 1))
                    logging.info('DLin: %s, %s: %s' % (p_synset, h_synset, brown_score))
                except WordNetError:
                    pass
                try:
                    semcor_score = p_synset.lin_similarity(h_synset, self.semcor_ic)
                    scores.append(min(semcor_score, 1))
                    logging.info('DLin: %s, %s: %s' % (p_synset, h_synset, semcor_score))
                except WordNetError:
                    pass
        return max(scores)

    # TODO if both synsets contain same synset, should that score be counted?
    def getJiCo(self, pSynsets, hSynsets):
        # Best Jiang-Conrath similarity over all synset pairs; jcn_similarity
        # can return very large values for identical synsets, hence the cap
        scores = [0]
        for p_synset in pSynsets:
            for h_synset in hSynsets:
                try:
                    brown_score = p_synset.jcn_similarity(h_synset, self.brown_ic)
                    scores.append(min(brown_score, 1))
                    logging.info('Brown score: %s, %s: %s' % (p_synset, h_synset, brown_score))
                except WordNetError:
                    pass
                try:
                    semcor_score = p_synset.jcn_similarity(h_synset, self.semcor_ic)
                    scores.append(min(semcor_score, 1))
                    logging.info('Semcor score: %s, %s: %s' % (p_synset, h_synset, semcor_score))
                except WordNetError:
                    pass
        return max(scores)
    def getWNHyper(self, pSynsets, hSynsets):
        # 1 - d/8 where d is the shortest nonzero hypernym-path distance
        # from any p synset up to any h synset; 0 if h is never a hypernym
        # of p
        path_distances = []
        for p_synset in pSynsets:
            logging.info('p synset is ' + str(p_synset))
            p_hypernyms = p_synset.hypernym_distances()
            for h_synset in hSynsets:
                logging.info('h synset is ' + str(h_synset))
                if h_synset in [synset_dist_tuple[0] for synset_dist_tuple in p_hypernyms]:
                    for synset in p_hypernyms:
                        if synset[0] == h_synset:
                            logging.info('Found h as hypernym of p: ' + str(synset))
                            if synset[1] != 0:
                                path_distances.append(synset[1])
        if len(path_distances) > 0:
            logging.info(path_distances)
            shortest_path = min(path_distances)
            score = 1 - (shortest_path / 8)
            return score
        else:
            return 0

    def getWNHypo(self, pSynsets, hSynsets):
        # Mirror image of getWNHyper: fires when p is a hypernym of h
        path_distances = []
        for h_synset in hSynsets:
            logging.info('h synset is ' + str(h_synset))
            h_hypernyms = h_synset.hypernym_distances()
            for p_synset in pSynsets:
                logging.info('p synset is ' + str(p_synset))
                if p_synset in [synset_dist_tuple[0] for synset_dist_tuple in h_hypernyms]:
                    for synset in h_hypernyms:
                        if synset[0] == p_synset:
                            logging.info('Found p as hypernym of h: ' + str(synset))
                            if synset[1] != 0:
                                path_distances.append(synset[1])
        if len(path_distances) > 0:
            logging.info(path_distances)
            shortest_path = min(path_distances)
            score = 1 - (shortest_path / 8)
            return score
        else:
            return 0
    def getWNAnt(self, alignment, p_synsets, h_synsets):
        # 1 if any synonym of p is an antonym of h, or vice versa
        # Antonyms of h (lists of antonym lemmas, one list per source lemma)
        h_antonym_lemmas = [l.antonyms() for s in h_synsets for l in s.lemmas]
        h_antonyms = []
        for lemma_list in h_antonym_lemmas:
            for lemma in lemma_list:
                h_antonyms += [name for name in lemma.synset.lemma_names]
        print '\nH: %s\nants:\n%s' % (h_synsets, h_antonyms)
        # Antonyms of p
        p_antonym_lemmas = [l.antonyms() for s in p_synsets for l in s.lemmas]
        p_antonyms = []
        for lemma_list in p_antonym_lemmas:
            for lemma in lemma_list:
                p_antonyms += [name for name in lemma.synset.lemma_names]
        print '\nP: %s\nants:\n%s' % (p_synsets, p_antonyms)
        # Synonyms of p
        p_lemmas = []
        for synset in p_synsets:
            p_lemmas += synset.lemma_names
        print '\nP synonyms:\n%s' % p_lemmas
        # Synonyms of h
        h_lemmas = []
        for synset in h_synsets:
            h_lemmas += synset.lemma_names
        print '\nH synonyms:\n%s' % h_lemmas
        for p_synonym in p_lemmas:
            if p_synonym in h_antonyms:
                print 'ANTONYM: %s' % p_synonym
                return 1
        for h_synonym in h_lemmas:
            if h_synonym in p_antonyms:
                print 'ANTONYM: %s' % h_synonym
                return 1
        return 0
    # return 1 if p and h are synonyms
    def getWNSyn(self, alignment, pSynsets, hSynsets):
        h_synonyms = [l.name for s in hSynsets for l in s.lemmas]
        p_synonyms = [l.name for s in pSynsets for l in s.lemmas]
        logging.info(h_synonyms)
        logging.info(p_synonyms)
        if alignment.h_token in p_synonyms:
            logging.info('h is a synonym of p')
            return 1
        elif alignment.p_token in h_synonyms:
            logging.info('p is a synonym of h')
            return 1
        else:
            logging.info('p and h are not synonyms')
            return 0
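    # Feature vector layout produced by getFeatures (indices into the
    # length-16 array):
    #   0  WordNet synonym          8  tokens equal
    #   1  (unimplemented, TODO)    9  neither contains the other
    #   2  WordNet hypernym        10  both tokens stoplisted
    #   3  WordNet hyponym         11  both tokens prepositions
    #   4  Jiang-Conrath           12  both tokens pronouns
    #   5  Lin                     13  edit-distance similarity
    #   6  h contained in p        14  noun-noun POS match
    #   7  p contained in h        15  nominalization pair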
    def getFeatures(self, alignment):
        print '%s %s' % (alignment.p_token, alignment.h_token)
        print 'POS: %s' % alignment.get_p_wn_tag()
        pSynsets = []
        if alignment.p_wn_tag != 'SKIP':
            pSynsets = wn.synsets(alignment.p_token, pos=alignment.p_wn_tag)
        hSynsets = []
        if alignment.h_wn_tag != 'SKIP':
            hSynsets = wn.synsets(alignment.h_token, pos=alignment.h_wn_tag)
        features = np.zeros(16, dtype=float)
        features[0] = self.getWNSyn(alignment, pSynsets, hSynsets)
        features[1] = 0  # TODO likely the antonym slot; getWNAnt is defined but not wired in
        features[2] = self.getWNHyper(pSynsets, hSynsets)
        features[3] = self.getWNHypo(pSynsets, hSynsets)
        features[4] = self.getJiCo(pSynsets, hSynsets)
        features[5] = self.getDLin(pSynsets, hSynsets)
        features[6] = self.getLemSubSeqF(alignment)
        features[7] = self.getLemSubSeqR(alignment)
        features[8] = self.getLemSubSeqE(alignment)
        features[9] = self.getLemSubSeqN(alignment)
        features[10] = self.getLight(alignment)
        features[11] = self.getPreps(alignment)
        features[12] = self.getPronoun(alignment)
        features[13] = self.getLemStrSim(alignment)
        features[14] = self.getNNNN(alignment)
        features[15] = self.getNomB(alignment)
        return features
if __name__ == '__main__':
    # Smoke test of the antonym feature on an adjective pair
    edit1 = Alignment_sub.Alignment_sub('happy', 'JJ', 'sad', 'JJ')
    p_synsets = wn.synsets(edit1.p_token, pos=edit1.p_wn_tag)
    h_synsets = wn.synsets(edit1.h_token, pos=edit1.h_wn_tag)
    featurizer = Lexent_featurizer_sub()
    ant_feature = featurizer.getWNAnt(edit1, p_synsets, h_synsets)
    print 'Ant: %s' % ant_feature
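The gist imports Alignment_sub, a sibling module that is not included here. For anyone who wants to run the demo above, a minimal stand-in consistent with the attribute accesses in the featurizer (p_token, h_token, p_wn_tag, h_wn_tag, p_penn_tag, h_penn_tag, get_p_wn_tag) might look like the sketch below; the attribute names and the Penn-to-WordNet tag mapping are inferred from usage, not taken from the real module.

# Alignment_sub.py -- hypothetical stand-in; every name here is inferred
# from how Lexent_featurizer_sub uses the class, not from the original module.
class Alignment_sub:
    # Map a Penn tag prefix to a WordNet POS, or 'SKIP' if WordNet has none
    PENN_TO_WN = {'JJ': 'a', 'NN': 'n', 'VB': 'v', 'RB': 'r'}

    def __init__(self, p_token, p_penn_tag, h_token, h_penn_tag):
        self.p_token = p_token
        self.h_token = h_token
        self.p_penn_tag = [p_penn_tag]  # getNNNN iterates over a tag list
        self.h_penn_tag = [h_penn_tag]
        self.p_wn_tag = self.PENN_TO_WN.get(p_penn_tag[:2], 'SKIP')
        self.h_wn_tag = self.PENN_TO_WN.get(h_penn_tag[:2], 'SKIP')

    def get_p_wn_tag(self):
        return self.p_wn_tag

With this stand-in, Lexent_featurizer_sub().getFeatures(Alignment_sub('happy', 'JJ', 'sad', 'JJ')) returns the full 16-dimensional vector, provided the resources/ files exist alongside the module.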