"""
Code to make a network out of the shortest N cosine-distances (or, equivalently, the strongest N associations)
between a set of words in a gensim word2vec model.
To use:
Set the filenames for the word2vec model.
Set `my_words` to a list of words of your own choosing.
Set `num_top_conns` to a fixed number or a multiple of the length of `my_words`.
Choose one of the two methods below to produce distances, and comment out the other.
"""
# Import gensim and load the model
# (gensim >= 1.0 moved this loader from Word2Vec to KeyedVectors)
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('[model].txt.gz', fvocab='[vocab].txt')
# Set the words we want to find connections between
my_words = ['a','b', ...]
my_words = [word for word in my_words if word in model] # filter out words not in model
# The number of connections we want: either a multiple of the number of words or a fixed number
num_top_conns = len(my_words) * 2
#######
# Make a list of all word-to-word distances [each as a tuple of (word1, word2, dist)]
dists = []

## Method 1 to find distances: use gensim to get the similarity between each word pair
for i1, word1 in enumerate(my_words):
    for i2, word2 in enumerate(my_words):
        if i1 >= i2: continue  # skip self-pairs and duplicate pairs
        cosine_similarity = model.similarity(word1, word2)
        cosine_distance = 1 - cosine_similarity
        dists.append((word1, word2, cosine_distance))
## Or, Method 2 to find distances: use scipy (faster)
import numpy as np
from scipy.spatial.distance import pdist, squareform

vectors = np.array([model[word] for word in my_words])
dist_matrix = squareform(pdist(vectors, 'cosine'))  # square matrix of pairwise cosine distances
for i1, word1 in enumerate(my_words):
    for i2, word2 in enumerate(my_words):
        if i1 >= i2: continue  # skip self-pairs and duplicate pairs
        cosine_distance = dist_matrix[i1, i2]
        dists.append((word1, word2, cosine_distance))
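As a sanity check on what `squareform(pdist(..., 'cosine'))` returns, here is a tiny self-contained example using made-up 2-D vectors (illustrative only, not drawn from any model):

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform

demo_vecs = np.array([[1.0, 0.0],
                      [0.0, 1.0],
                      [1.0, 1.0]])
demo_dm = squareform(pdist(demo_vecs, 'cosine'))

# Orthogonal vectors have cosine similarity 0, so cosine distance 1
print(round(demo_dm[0, 1], 4))  # 1.0
# [1,0] vs [1,1]: similarity = 1/sqrt(2), distance ≈ 0.2929
print(round(demo_dm[0, 2], 4))  # 0.2929
```

The matrix is symmetric with zeros on the diagonal, which is why the loop above only keeps pairs with `i1 < i2`.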
######
# Sort the list by ascending distance
dists.sort(key=lambda _tuple: _tuple[-1])
# Get the top connections
top_conns = dists[:num_top_conns]
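The sort-and-slice step behaves like this on toy tuples (the words and distances here are made up for illustration):

```python
demo_dists = [('a', 'b', 0.41), ('a', 'c', 0.12), ('b', 'c', 0.77)]
demo_dists.sort(key=lambda t: t[-1])   # ascending cosine distance
demo_top = demo_dists[:2]              # keep the 2 strongest associations
print(demo_top)  # [('a', 'c', 0.12), ('a', 'b', 0.41)]
```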
# Make a network
import networkx as nx
g = nx.Graph()
for word1, word2, dist in top_conns:
    weight = 1 - dist  # cosine similarity makes more sense for edge weight
    g.add_edge(word1, word2, weight=float(weight))
# Write the network
nx.write_graphml(g, 'my-semantic-network.graphml')
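To confirm the GraphML output round-trips, you can write a small graph and read it back with networkx (the words, weights, and filename here are illustrative):

```python
import networkx as nx

demo_g = nx.Graph()
demo_g.add_edge('cat', 'dog', weight=0.8)      # made-up edge weights
demo_g.add_edge('cat', 'kitten', weight=0.9)
nx.write_graphml(demo_g, 'example-network.graphml')

# Read the file back and check the weights survived the round-trip
demo_h = nx.read_graphml('example-network.graphml')
print(demo_h.number_of_edges())        # 2
print(demo_h['cat']['dog']['weight'])  # 0.8
```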