# Cosine Similarity, for NLP class presentation - ipython notebook version: http://nbviewer.ipython.org/gist/bozhang0504/5f67575d1397416b0f3d
import nltk
from nltk.corpus import wordnet as wn

### Synsets and lemmas
# An arbitrary word, e.g. 'dog', may have several senses, and we can find its synsets.
wn.synsets('dog')
# Once you have a synset, there are functions that return information about it;
# we will start with lemma_names(), lemmas(), definition() and examples().
# For the first synset 'dog.n.01', which is the first noun sense of 'dog', we can list all of its words/lemma names.
# These are all the words that are synonyms of this sense of 'dog'.
wn.synset('dog.n.01').lemma_names()
# Given a synset, find all its lemmas, where a lemma is the pairing of a word with a synset.
wn.synset('dog.n.01').lemmas()
# Given a lemma, find its synset.
wn.lemma('dog.n.01.domestic_dog').synset()
# Given a word, find the lemmas contained in all synsets it belongs to.
for synset in wn.synsets('dog'):
    print(synset, ": ", synset.lemma_names())
# Given a word, find all lemmas involving that word. Note that these correspond to the synsets
# of 'dog' shown above, but they also show that 'dog' is one of the words in each synset.
wn.lemmas('dog')
#--------------------------------------------------------------------------------------------
### Definitions and examples
# Other synset functions provide definitions and usage examples.
# Find the definition and examples of the synset for the first sense of 'dog':
wn.synset('dog.n.01').definition()
wn.synset('dog.n.01').examples()
# Or we can show all the synsets and their definitions:
for synset in wn.synsets('dog'):
    print(synset, ": ", synset.definition())
#--------------------------------------------------------------------------------------------
### The WordNet Hierarchy
# WordNet contains many relations between synsets.
# In particular, we often explore the hierarchy of WordNet synsets induced by the hypernym and hyponym relations.
# (These relations are sometimes called "is-a" relations, because they link a specific concept
# to the more general concept it is an instance of.)
# Find hypernyms of a synset of 'dog':
dog1 = wn.synset('dog.n.01')
dog1.hypernyms()
# Find hyponyms:
dog1.hyponyms()
# Find the most general hypernym, the root hypernym:
dog1.root_hypernyms()
# hypernym_paths() returns the paths from the top of the hierarchy down to the synset.
# In this example, there are two paths between 'entity' and the first sense of 'dog'.
pathsdog = dog1.hypernym_paths()
print(len(pathsdog))
[synset.name() for synset in pathsdog[0]]
[synset.name() for synset in pathsdog[1]]
# min_depth() gives the smallest number of edges between a synset and the top of the hierarchy.
dog1.min_depth()
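# A sketch of how min_depth relates to hypernym_paths (the path lengths below are
# made-up illustrative numbers, not actual WordNet output): each path lists the
# synsets from the root down to the synset itself, so the depth of a path is its
# length minus one, and min_depth is the smallest such depth.
path_lengths = [9, 8]                          # hypothetical lengths of two hypernym paths
min_depth = min(n - 1 for n in path_lengths)
print(min_depth)                               # 7 with these made-up lengths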
#--------------------------------------------------------------------------------------------
### Word Similarity
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
hit = wn.synset('hit.v.01')
slap = wn.synset('slap.v.01')
# One way to gauge semantic similarity is to find the lowest common hypernym of two synsets.
dog.lowest_common_hypernyms(cat)
pathscat = cat.hypernym_paths()
[synset.name() for synset in pathscat[0]]
# synset1.path_similarity(synset2): return a score denoting how similar two word senses are,
# based on the shortest path that connects the senses in the is-a (hypernym/hyponym) taxonomy.
# The score is in the range 0 to 1; a score of 1 represents identity, i.e. comparing a sense with itself returns 1.
dog.path_similarity(cat)
dog.path_similarity(dog)
hit.path_similarity(slap)
wn.path_similarity(hit, slap)
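# Sketch of the formula behind path_similarity, with a hypothetical distance rather
# than actual WordNet output: the score is 1 / (d + 1), where d is the number of
# edges on the shortest is-a path connecting the two senses.
def path_sim(d):
    return 1.0 / (d + 1)
print(path_sim(0))   # identical senses: distance 0 -> 1.0
print(path_sim(4))   # e.g. a hypothetical distance of 4 edges -> 0.2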
# wordnet_ic information content: load an information content file from the wordnet_ic corpus.
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
# Resnik similarity: return a score denoting how similar two word senses are,
# based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node).
# Note that for any similarity measure that uses information content,
# the result depends on the corpus used to generate the information content and
# on the specifics of how the information content was created.
dog.res_similarity(cat, brown_ic)
dog.res_similarity(cat, semcor_ic)
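# Sketch of the idea behind IC-based measures, using a made-up probability rather
# than a real corpus value: IC(s) = -log p(s), where p(s) is the probability of
# encountering an instance of synset s in the corpus. Resnik similarity is then
# simply the IC of the least common subsumer of the two synsets.
import math
p_lcs = 0.01                    # hypothetical corpus probability of the LCS
resnik = -math.log(p_lcs)       # Resnik similarity = IC(lcs)
print(resnik)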
# Jiang-Conrath similarity: return a score denoting how similar two word senses are,
# based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and
# that of the two input synsets. The relationship is given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).
dog.jcn_similarity(cat, brown_ic)
dog.jcn_similarity(cat, semcor_ic)
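# Worked sketch of the Jiang-Conrath equation with made-up IC values
# (illustrative numbers, not corpus-derived output):
ic_s1, ic_s2, ic_lcs = 7.0, 8.0, 6.0           # hypothetical information-content values
jcn = 1.0 / (ic_s1 + ic_s2 - 2 * ic_lcs)
print(jcn)                                     # 1 / (7 + 8 - 12) = 1/3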
# Lin similarity: return a score denoting how similar two word senses are,
# based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and
# that of the two input synsets. The relationship is given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).
dog.lin_similarity(cat, brown_ic)
dog.lin_similarity(cat, semcor_ic)
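# Worked sketch of the Lin equation with made-up IC values
# (illustrative numbers, not corpus-derived output):
ic_s1, ic_s2, ic_lcs = 7.0, 8.0, 6.0           # hypothetical information-content values
lin = 2 * ic_lcs / (ic_s1 + ic_s2)
print(lin)                                     # 2 * 6 / (7 + 8) = 0.8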