# Lexical entailment featurizer for substitution edits
# @gavinmh, created December 4, 2012
from __future__ import division
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.metrics import edit_distance
from nltk.corpus.reader.wordnet import WordNetError
import numpy as np
import logging, os
import Alignment_sub
class Lexent_featurizer_sub:
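    """Computes a 16-dimensional lexical-entailment feature vector for a
    single substitution edit (p token vs. h token), using WordNet relations
    (synonymy, hypernymy, antonymy), information-content similarity measures
    (Lin, Jiang-Conrath), string similarity, and POS/word-list cues.
    """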
    def __init__(self):
        logging.basicConfig(level=logging.DEBUG)
        # Information-content files for the Lin and Jiang-Conrath measures.
        self.brown_ic = wordnet_ic.ic('ic-brown.dat')
        self.semcor_ic = wordnet_ic.ic('ic-semcor.dat')
        # Word-list resources: verb/nominalization pairs, stopwords,
        # prepositions, and pronouns.
        filename = os.path.join(os.path.dirname(__file__), 'resources/verb_nom_tuples.txt')
        with open(filename) as f:
            self.nom_adj_verb_tuples = f.readlines()
        stoplistFile = os.path.join(os.path.dirname(__file__), 'resources/stoplist.txt')
        with open(stoplistFile) as f:
            self.stoplist = f.read().splitlines()
        prepositionsFile = os.path.join(os.path.dirname(__file__), 'resources/prepositions.txt')
        with open(prepositionsFile) as f:
            self.prepositions = f.read().splitlines()
        pronounsFile = os.path.join(os.path.dirname(__file__), 'resources/pronouns.txt')
        with open(pronounsFile) as f:
            self.pronouns = f.read().splitlines()
    def getNomB(self, alignment):
        # Fires (0.75) when the token pair appears in the verb/nominalization
        # list in either order. readlines() keeps trailing newlines, so the
        # lookup keys must include one too.
        h = alignment.h_token
        p = alignment.p_token
        pair1 = h + ',' + p + '\n'
        pair2 = p + ',' + h + '\n'
        if pair1 in self.nom_adj_verb_tuples or pair2 in self.nom_adj_verb_tuples:
            return 0.75
        return 0
    def getNNNN(self, alignment):
        # 1 if both tokens are tagged exclusively as common nouns, or both
        # exclusively as proper nouns.
        noun_types = ['NN', 'NNS']
        proper_noun_types = ['NNP', 'NNPS']
        h_misses = [tag for tag in alignment.h_penn_tag if tag not in noun_types]
        p_misses = [tag for tag in alignment.p_penn_tag if tag not in noun_types]
        if len(h_misses) == 0 and len(p_misses) == 0:
            return 1
        h_misses = [tag for tag in alignment.h_penn_tag if tag not in proper_noun_types]
        p_misses = [tag for tag in alignment.p_penn_tag if tag not in proper_noun_types]
        if len(h_misses) == 0 and len(p_misses) == 0:
            return 1
        return 0
    def getPronoun(self, alignment):
        # 1 if both tokens are pronouns.
        if alignment.h_token in self.pronouns and alignment.p_token in self.pronouns:
            return 1
        return 0
    def getLemStrSim(self, alignment):
        # Levenshtein distance normalized by (max token length - 2), so short
        # tokens need a near-exact match to score above zero. The min(1, ...)
        # guard is added here: for tokens of length <= 2 the denominator goes
        # negative and the raw score can blow up.
        p = alignment.p_token
        h = alignment.h_token
        distance = edit_distance(h, p)
        max_length = max(len(h), len(p))
        score = 1 - (distance / (max_length - 2.000000001))
        return max(0, min(1, score))
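    # Worked example: edit_distance('happy', 'sad') is 4 with max_length 5,
    # so the raw score is 1 - 4/2.999999999, roughly -0.33, and is clamped
    # to 0; identical tokens give distance 0 and score exactly 1.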
    def getLight(self, alignment):
        # 1 if both tokens are stoplist ("light") words.
        if alignment.h_token in self.stoplist and alignment.p_token in self.stoplist:
            return 1
        return 0

    def getPreps(self, alignment):
        # 1 if both tokens are prepositions.
        if alignment.h_token in self.prepositions and alignment.p_token in self.prepositions:
            return 1
        return 0
    def contains(self, small, big):
        # True if `small` occurs as a contiguous subsequence of `big`.
        for i in xrange(len(big) - len(small) + 1):
            for j in xrange(len(small)):
                if big[i + j] != small[j]:
                    break
            else:
                # Inner loop completed without a mismatch: match at offset i.
                return True
        return False
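    # For example, contains([1, 2], [0, 1, 2, 3]) is True, while
    # contains([1, 3], [0, 1, 2, 3]) is False.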
    def phrase_contains(self, p, h):
        # True if either phrase contains the other.
        return self.contains(p, h) or self.contains(h, p)
    # TODO: lemmas should be passed to the following methods,
    # TODO: or should the methods lemmatize the tokens themselves?
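    # The four LemSubSeq features below partition the substring relations
    # between the tokens: forward (h inside p), reverse (p inside h),
    # equal, and none.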
    def getLemSubSeqF(self, alignment):
        p = alignment.p_token
        h = alignment.h_token
        if h in p and p != h:
            return 1
        return 0

    def getLemSubSeqR(self, alignment):
        p = alignment.p_token
        h = alignment.h_token
        if p in h and p != h:
            return 1
        return 0

    def getLemSubSeqE(self, alignment):
        if alignment.h_token == alignment.p_token:
            return 1
        return 0

    def getLemSubSeqN(self, alignment):
        p = alignment.p_token
        h = alignment.h_token
        if p != h and p not in h and h not in p:
            return 1
        return 0
    def getDLin(self, pSynsets, hSynsets):
        # Maximum Lin similarity over all synset pairs, under both the Brown
        # and SemCor information-content models; capped at 1.
        scores = [0]
        for p_synset in pSynsets:
            for h_synset in hSynsets:
                try:
                    brown_score = p_synset.lin_similarity(h_synset, self.brown_ic)
                    scores.append(min(brown_score, 1))
                    logging.info('DLin: %s, %s: %s' % (p_synset, h_synset, brown_score))
                except WordNetError:
                    pass
                try:
                    semcor_score = p_synset.lin_similarity(h_synset, self.semcor_ic)
                    scores.append(min(semcor_score, 1))
                    logging.info('DLin: %s, %s: %s' % (p_synset, h_synset, semcor_score))
                except WordNetError:
                    pass
        return max(scores)
    # TODO: if both synset lists contain the same synset, should that score be counted?
    def getJiCo(self, pSynsets, hSynsets):
        # Maximum Jiang-Conrath similarity over all synset pairs, under both
        # IC models. jcn_similarity can return very large values (e.g. for
        # identical synsets), hence the cap at 1.
        scores = [0]
        for p_synset in pSynsets:
            for h_synset in hSynsets:
                try:
                    brown_score = p_synset.jcn_similarity(h_synset, self.brown_ic)
                    scores.append(min(brown_score, 1))
                    logging.info('Brown score: %s, %s: %s' % (p_synset, h_synset, brown_score))
                except WordNetError:
                    pass
                try:
                    semcor_score = p_synset.jcn_similarity(h_synset, self.semcor_ic)
                    scores.append(min(semcor_score, 1))
                    logging.info('Semcor score: %s, %s: %s' % (p_synset, h_synset, semcor_score))
                except WordNetError:
                    pass
        return max(scores)
    def getWNHyper(self, pSynsets, hSynsets):
        # Score 1 - d/8, where d is the shortest nonzero distance at which
        # some h synset appears among the hypernyms of some p synset. Note
        # the score can go negative for paths longer than 8 steps.
        path_distances = []
        for p_synset in pSynsets:
            logging.info('p synset is ' + str(p_synset))
            p_hypernyms = p_synset.hypernym_distances()
            for h_synset in hSynsets:
                logging.info('h synset is ' + str(h_synset))
                if h_synset in [synset_dist_tuple[0] for synset_dist_tuple in p_hypernyms]:
                    for synset in p_hypernyms:
                        if synset[0] == h_synset:
                            logging.info('Found h as hypernym of p: ' + str(synset))
                            if synset[1] != 0:
                                path_distances.append(synset[1])
        if len(path_distances) > 0:
            logging.info(path_distances)
            shortest_path = min(path_distances)
            score = 1 - (shortest_path / 8)
            return score
        return 0
    def getWNHypo(self, pSynsets, hSynsets):
        # Mirror image of getWNHyper: looks for a p synset among the
        # hypernyms of the h synsets.
        path_distances = []
        for h_synset in hSynsets:
            logging.info('h synset is ' + str(h_synset))
            h_hypernyms = h_synset.hypernym_distances()
            for p_synset in pSynsets:
                logging.info('p synset is ' + str(p_synset))
                if p_synset in [synset_dist_tuple[0] for synset_dist_tuple in h_hypernyms]:
                    for synset in h_hypernyms:
                        if synset[0] == p_synset:
                            logging.info('Found p as hypernym of h: ' + str(synset))
                            if synset[1] != 0:
                                path_distances.append(synset[1])
        if len(path_distances) > 0:
            logging.info(path_distances)
            shortest_path = min(path_distances)
            score = 1 - (shortest_path / 8)
            return score
        return 0
    def getWNAnt(self, alignment, p_synsets, h_synsets):
        # 1 if any synonym (lemma name) of p is an antonym of h, or any
        # synonym of h is an antonym of p.
        # Antonyms of h
        h_antonym_synsets = [l.antonyms() for s in h_synsets for l in s.lemmas]
        h_antonyms = []
        for lemma_list in h_antonym_synsets:
            for lemma in lemma_list:
                h_antonyms += [name for name in lemma.synset.lemma_names]
        print '\nH: %s\nants:\n%s' % (h_synsets, h_antonyms)
        # Antonyms of p
        p_antonym_synsets = [l.antonyms() for s in p_synsets for l in s.lemmas]
        p_antonyms = []
        for lemma_list in p_antonym_synsets:
            for lemma in lemma_list:
                p_antonyms += [name for name in lemma.synset.lemma_names]
        print '\nP: %s\nants:\n%s' % (p_synsets, p_antonyms)
        # Synonyms of p
        p_lemmas = []
        for synset in p_synsets:
            p_lemmas += synset.lemma_names
        print '\nP synonyms:\n%s' % p_lemmas
        # Synonyms of h
        h_lemmas = []
        for synset in h_synsets:
            h_lemmas += synset.lemma_names
        print '\nH synonyms:\n%s' % h_lemmas
        for p_synonym in p_lemmas:
            if p_synonym in h_antonyms:
                print 'ANTONYM: %s' % p_synonym
                return 1
        for h_synonym in h_lemmas:
            if h_synonym in p_antonyms:
                print 'ANTONYM: %s' % h_synonym
                return 1
        return 0
    # Return 1 if p and h are synonyms.
    def getWNSyn(self, alignment, pSynsets, hSynsets):
        h_synonyms = [l.name for s in hSynsets for l in s.lemmas]
        p_synonyms = [l.name for s in pSynsets for l in s.lemmas]
        logging.info(h_synonyms)
        logging.info(p_synonyms)
        if alignment.h_token in p_synonyms:
            logging.info('h is a synonym of p')
            return 1
        elif alignment.p_token in h_synonyms:
            logging.info('p is a synonym of h')
            return 1
        else:
            logging.info('p and h are not synonyms')
            return 0
    def getFeatures(self, alignment):
        # Assemble the 16-dimensional feature vector for one substitution edit.
        print '%s %s' % (alignment.p_token, alignment.h_token)
        print 'POS: %s' % alignment.get_p_wn_tag()
        pSynsets = []
        if alignment.p_wn_tag != 'SKIP':
            pSynsets = wn.synsets(alignment.p_token, pos=alignment.p_wn_tag)
        hSynsets = []
        if alignment.h_wn_tag != 'SKIP':
            hSynsets = wn.synsets(alignment.h_token, pos=alignment.h_wn_tag)
        features = np.zeros(16, dtype=float)
        features[0] = self.getWNSyn(alignment, pSynsets, hSynsets)
        features[1] = 0  # TODO: placeholder (getWNAnt is the likely candidate)
        features[2] = self.getWNHyper(pSynsets, hSynsets)
        features[3] = self.getWNHypo(pSynsets, hSynsets)
        features[4] = self.getJiCo(pSynsets, hSynsets)
        features[5] = self.getDLin(pSynsets, hSynsets)
        features[6] = self.getLemSubSeqF(alignment)
        features[7] = self.getLemSubSeqR(alignment)
        features[8] = self.getLemSubSeqE(alignment)
        features[9] = self.getLemSubSeqN(alignment)
        features[10] = self.getLight(alignment)
        features[11] = self.getPreps(alignment)
        features[12] = self.getPronoun(alignment)
        features[13] = self.getLemStrSim(alignment)
        features[14] = self.getNNNN(alignment)
        features[15] = self.getNomB(alignment)
        return features
if __name__ == '__main__':
    # Quick check of the antonym feature on an adjective pair.
    edit1 = Alignment_sub.Alignment_sub('happy', 'JJ', 'sad', 'JJ')
    p_synsets = wn.synsets(edit1.p_token, pos=edit1.p_wn_tag)
    h_synsets = wn.synsets(edit1.h_token, pos=edit1.h_wn_tag)
    featurizer = Lexent_featurizer_sub()
    ant_feature = featurizer.getWNAnt(edit1, p_synsets, h_synsets)
    print 'Ant: %s' % ant_feature
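    # Hypothetical fuller check: compute the complete feature vector for the
    # same edit. This assumes the Alignment_sub instance also carries the
    # h_penn_tag/p_penn_tag fields that getNNNN and getFeatures read.
    features = featurizer.getFeatures(edit1)
    print 'Features: %s' % features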