Lexical entailment featurizer for substitution edits
# Python 2 / NLTK 2.x idioms throughout (print statements, xrange,
# synset.lemmas and lemma.name as attributes rather than methods).
from __future__ import division
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.metrics import edit_distance
from nltk.corpus.reader.wordnet import WordNetError
import numpy as np
import logging, os
import Alignment_sub  # sibling module defining substitution-edit alignments (not included in this gist)
class Lexent_featurizer_sub:
    """Extracts lexical-entailment features for substitution edits."""

    def __init__(self):
        logging.basicConfig(level=logging.DEBUG)
        # WordNet information-content files for the similarity measures
        self.brown_ic = wordnet_ic.ic('ic-brown.dat')
        self.semcor_ic = wordnet_ic.ic('ic-semcor.dat')
        # Nominalization pairs, kept with trailing newlines to match the
        # whole-line lookups in getNomB
        filename = os.path.join(os.path.dirname(__file__), 'resources/verb_nom_tuples.txt')
        with open(filename) as f:
            self.nom_adj_verb_tuples = f.readlines()
        stoplistFile = os.path.join(os.path.dirname(__file__), 'resources/stoplist.txt')
        with open(stoplistFile) as f:
            self.stoplist = f.read().splitlines()
        prepositionsFile = os.path.join(os.path.dirname(__file__), 'resources/prepositions.txt')
        with open(prepositionsFile) as f:
            self.prepositions = f.read().splitlines()
        pronounsFile = os.path.join(os.path.dirname(__file__), 'resources/pronouns.txt')
        with open(pronounsFile) as f:
            self.pronouns = f.read().splitlines()
    def getNomB(self, alignment):
        # 0.75 if the token pair appears (in either order) in the
        # nominalization list; lines were loaded with trailing newlines
        h = alignment.h_token
        p = alignment.p_token
        pair1 = h + ',' + p + '\n'
        pair2 = p + ',' + h + '\n'
        if pair1 in self.nom_adj_verb_tuples or pair2 in self.nom_adj_verb_tuples:
            return 0.75
        return 0
    def getNNNN(self, alignment):
        # 1 if both tokens are tagged exclusively as common nouns, or both
        # exclusively as proper nouns
        noun_types = ['NN', 'NNS']
        proper_noun_types = ['NNP', 'NNPS']
        h_misses = [tag for tag in alignment.h_penn_tag if tag not in noun_types]
        p_misses = [tag for tag in alignment.p_penn_tag if tag not in noun_types]
        if len(h_misses) == 0 and len(p_misses) == 0:
            return 1
        h_misses = [tag for tag in alignment.h_penn_tag if tag not in proper_noun_types]
        p_misses = [tag for tag in alignment.p_penn_tag if tag not in proper_noun_types]
        if len(h_misses) == 0 and len(p_misses) == 0:
            return 1
        return 0
    def getPronoun(self, alignment):
        if alignment.h_token in self.pronouns and alignment.p_token in self.pronouns:
            return 1
        return 0
    def getLemStrSim(self, alignment):
        # Normalized edit-distance similarity; the near-2 offset in the
        # denominator discounts very short tokens.  Clamped to [0, 1] so
        # that tokens of length <= 2, where the denominator goes
        # non-positive, cannot produce out-of-range scores.
        p = alignment.p_token
        h = alignment.h_token
        distance = edit_distance(h, p)
        max_length = max(len(h), len(p))
        score = 1 - (distance / (max_length - 2.000000001))
        return max(0, min(1, score))
    def getLight(self, alignment):
        if alignment.h_token in self.stoplist and alignment.p_token in self.stoplist:
            return 1
        return 0

    def getPreps(self, alignment):
        if alignment.h_token in self.prepositions and alignment.p_token in self.prepositions:
            return 1
        return 0
    def contains(self, small, big):
        # True if 'small' occurs as a contiguous subsequence of 'big'
        for i in xrange(len(big) - len(small) + 1):
            for j in xrange(len(small)):
                if big[i + j] != small[j]:
                    break
            else:
                #return i, i+len(small)
                return True
        return False

    def phrase_contains(self, p, h):
        return self.contains(p, h) or self.contains(h, p)
    # TODO lemmas should be passed to the following
    # TODO or should they lemmatize tokens?
    def getLemSubSeqF(self, alignment):
        # forward containment: h is a proper substring of p
        p = alignment.p_token
        h = alignment.h_token
        if h in p and p != h:
            return 1
        return 0

    def getLemSubSeqR(self, alignment):
        # reverse containment: p is a proper substring of h
        p = alignment.p_token
        h = alignment.h_token
        if p in h and p != h:
            return 1
        return 0

    def getLemSubSeqE(self, alignment):
        # the tokens are identical
        if alignment.h_token == alignment.p_token:
            return 1
        return 0

    def getLemSubSeqN(self, alignment):
        # neither token contains the other
        p = alignment.p_token
        h = alignment.h_token
        if p != h and p not in h and h not in p:
            return 1
        return 0
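    # Taken together, the four getLemSubSeq* features one-hot encode the
    # substring relation between the tokens: exactly one of forward
    # containment, reverse containment, equality, or no containment fires
    # for any token pair.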
    def getDLin(self, pSynsets, hSynsets):
        # Best Lin similarity over all synset pairs, under both the Brown
        # and SemCor information-content corpora; capped at 1
        scores = [0]
        for p_synset in pSynsets:
            for h_synset in hSynsets:
                try:
                    brown_score = p_synset.lin_similarity(h_synset, self.brown_ic)
                    scores.append(min(brown_score, 1))
                    logging.info('DLin: %s, %s: %s' % (p_synset, h_synset, brown_score))
                except WordNetError:
                    pass
                try:
                    semcor_score = p_synset.lin_similarity(h_synset, self.semcor_ic)
                    scores.append(min(semcor_score, 1))
                    logging.info('DLin: %s, %s: %s' % (p_synset, h_synset, semcor_score))
                except WordNetError:
                    pass
        return max(scores)

    # TODO if both synsets contain same synset, should that score be counted?
    def getJiCo(self, pSynsets, hSynsets):
        # Best Jiang-Conrath similarity over all synset pairs; jcn_similarity
        # can return very large values for identical synsets, hence the cap
        scores = [0]
        for p_synset in pSynsets:
            for h_synset in hSynsets:
                try:
                    brown_score = p_synset.jcn_similarity(h_synset, self.brown_ic)
                    scores.append(min(brown_score, 1))
                    logging.info('Brown score: %s, %s: %s' % (p_synset, h_synset, brown_score))
                except WordNetError:
                    pass
                try:
                    semcor_score = p_synset.jcn_similarity(h_synset, self.semcor_ic)
                    scores.append(min(semcor_score, 1))
                    logging.info('Semcor score: %s, %s: %s' % (p_synset, h_synset, semcor_score))
                except WordNetError:
                    pass
        return max(scores)
    def getWNHyper(self, pSynsets, hSynsets):
        # 1 - d/8 where d is the shortest nonzero hypernym-path distance
        # from any p synset up to any h synset; 0 if h is never a hypernym
        # of p
        path_distances = []
        for p_synset in pSynsets:
            logging.info('p synset is ' + str(p_synset))
            p_hypernyms = p_synset.hypernym_distances()
            for h_synset in hSynsets:
                logging.info('h synset is ' + str(h_synset))
                if h_synset in [synset_dist_tuple[0] for synset_dist_tuple in p_hypernyms]:
                    for synset in p_hypernyms:
                        if synset[0] == h_synset:
                            logging.info('Found h as hypernym of p: ' + str(synset))
                            if synset[1] != 0:
                                path_distances.append(synset[1])
        if len(path_distances) > 0:
            logging.info(path_distances)
            shortest_path = min(path_distances)
            score = 1 - (shortest_path / 8)
            return score
        else:
            return 0

    def getWNHypo(self, pSynsets, hSynsets):
        # Mirror image of getWNHyper: fires when p is a hypernym of h
        path_distances = []
        for h_synset in hSynsets:
            logging.info('h synset is ' + str(h_synset))
            h_hypernyms = h_synset.hypernym_distances()
            for p_synset in pSynsets:
                logging.info('p synset is ' + str(p_synset))
                if p_synset in [synset_dist_tuple[0] for synset_dist_tuple in h_hypernyms]:
                    for synset in h_hypernyms:
                        if synset[0] == p_synset:
                            logging.info('Found p as hypernym of h: ' + str(synset))
                            if synset[1] != 0:
                                path_distances.append(synset[1])
        if len(path_distances) > 0:
            logging.info(path_distances)
            shortest_path = min(path_distances)
            score = 1 - (shortest_path / 8)
            return score
        else:
            return 0
    def getWNAnt(self, alignment, p_synsets, h_synsets):
        # 1 if any synonym of p is an antonym of h, or vice versa
        # Antonyms of h (lists of antonym lemmas, one list per source lemma)
        h_antonym_lemmas = [l.antonyms() for s in h_synsets for l in s.lemmas]
        h_antonyms = []
        for lemma_list in h_antonym_lemmas:
            for lemma in lemma_list:
                h_antonyms += [name for name in lemma.synset.lemma_names]
        print '\nH: %s\nants:\n%s' % (h_synsets, h_antonyms)
        # Antonyms of p
        p_antonym_lemmas = [l.antonyms() for s in p_synsets for l in s.lemmas]
        p_antonyms = []
        for lemma_list in p_antonym_lemmas:
            for lemma in lemma_list:
                p_antonyms += [name for name in lemma.synset.lemma_names]
        print '\nP: %s\nants:\n%s' % (p_synsets, p_antonyms)
        # Synonyms of p
        p_lemmas = []
        for synset in p_synsets:
            p_lemmas += synset.lemma_names
        print '\nP synonyms:\n%s' % p_lemmas
        # Synonyms of h
        h_lemmas = []
        for synset in h_synsets:
            h_lemmas += synset.lemma_names
        print '\nH synonyms:\n%s' % h_lemmas
        for p_synonym in p_lemmas:
            if p_synonym in h_antonyms:
                print 'ANTONYM: %s' % p_synonym
                return 1
        for h_synonym in h_lemmas:
            if h_synonym in p_antonyms:
                print 'ANTONYM: %s' % h_synonym
                return 1
        return 0
    # return 1 if p and h are synonyms
    def getWNSyn(self, alignment, pSynsets, hSynsets):
        h_synonyms = [l.name for s in hSynsets for l in s.lemmas]
        p_synonyms = [l.name for s in pSynsets for l in s.lemmas]
        logging.info(h_synonyms)
        logging.info(p_synonyms)
        if alignment.h_token in p_synonyms:
            logging.info('h is a synonym of p')
            return 1
        elif alignment.p_token in h_synonyms:
            logging.info('p is a synonym of h')
            return 1
        else:
            logging.info('p and h are not synonyms')
            return 0
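    # Feature vector layout produced by getFeatures (indices into the
    # length-16 array):
    #   0  WordNet synonym          8  tokens equal
    #   1  (unimplemented, TODO)    9  neither contains the other
    #   2  WordNet hypernym        10  both tokens stoplisted
    #   3  WordNet hyponym         11  both tokens prepositions
    #   4  Jiang-Conrath           12  both tokens pronouns
    #   5  Lin                     13  edit-distance similarity
    #   6  h contained in p        14  noun-noun POS match
    #   7  p contained in h        15  nominalization pair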
    def getFeatures(self, alignment):
        print '%s %s' % (alignment.p_token, alignment.h_token)
        print 'POS: %s' % alignment.get_p_wn_tag()
        pSynsets = []
        if alignment.p_wn_tag != 'SKIP':
            pSynsets = wn.synsets(alignment.p_token, pos=alignment.p_wn_tag)
        hSynsets = []
        if alignment.h_wn_tag != 'SKIP':
            hSynsets = wn.synsets(alignment.h_token, pos=alignment.h_wn_tag)
        features = np.zeros(16, dtype=float)
        features[0] = self.getWNSyn(alignment, pSynsets, hSynsets)
        features[1] = 0  # TODO likely the antonym slot; getWNAnt is defined but not wired in
        features[2] = self.getWNHyper(pSynsets, hSynsets)
        features[3] = self.getWNHypo(pSynsets, hSynsets)
        features[4] = self.getJiCo(pSynsets, hSynsets)
        features[5] = self.getDLin(pSynsets, hSynsets)
        features[6] = self.getLemSubSeqF(alignment)
        features[7] = self.getLemSubSeqR(alignment)
        features[8] = self.getLemSubSeqE(alignment)
        features[9] = self.getLemSubSeqN(alignment)
        features[10] = self.getLight(alignment)
        features[11] = self.getPreps(alignment)
        features[12] = self.getPronoun(alignment)
        features[13] = self.getLemStrSim(alignment)
        features[14] = self.getNNNN(alignment)
        features[15] = self.getNomB(alignment)
        return features
if __name__ == '__main__':
    # Smoke test of the antonym feature on an adjective pair
    edit1 = Alignment_sub.Alignment_sub('happy', 'JJ', 'sad', 'JJ')
    p_synsets = wn.synsets(edit1.p_token, pos=edit1.p_wn_tag)
    h_synsets = wn.synsets(edit1.h_token, pos=edit1.h_wn_tag)
    featurizer = Lexent_featurizer_sub()
    ant_feature = featurizer.getWNAnt(edit1, p_synsets, h_synsets)
    print 'Ant: %s' % ant_feature
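The gist imports Alignment_sub, a sibling module that is not included here. For anyone who wants to run the demo above, a minimal stand-in consistent with the attribute accesses in the featurizer (p_token, h_token, p_wn_tag, h_wn_tag, p_penn_tag, h_penn_tag, get_p_wn_tag) might look like the sketch below; the attribute names and the Penn-to-WordNet tag mapping are inferred from usage, not taken from the real module.

# Alignment_sub.py -- hypothetical stand-in; every name here is inferred
# from how Lexent_featurizer_sub uses the class, not from the original module.
class Alignment_sub:
    # Map a Penn tag prefix to a WordNet POS, or 'SKIP' if WordNet has none
    PENN_TO_WN = {'JJ': 'a', 'NN': 'n', 'VB': 'v', 'RB': 'r'}

    def __init__(self, p_token, p_penn_tag, h_token, h_penn_tag):
        self.p_token = p_token
        self.h_token = h_token
        self.p_penn_tag = [p_penn_tag]  # getNNNN iterates over a tag list
        self.h_penn_tag = [h_penn_tag]
        self.p_wn_tag = self.PENN_TO_WN.get(p_penn_tag[:2], 'SKIP')
        self.h_wn_tag = self.PENN_TO_WN.get(h_penn_tag[:2], 'SKIP')

    def get_p_wn_tag(self):
        return self.p_wn_tag

With this stand-in, Lexent_featurizer_sub().getFeatures(Alignment_sub('happy', 'JJ', 'sad', 'JJ')) returns the full 16-dimensional vector, provided the resources/ files exist alongside the module.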