BrikerMan/w2v_visualizer-v2.py

## w2v_visualizer-v2.py
from gensim.models import KeyedVectors

# Load the embeddings from a word2vec-format file with gensim.
w2v_path = '<Gensim File Path>'
w2v = KeyedVectors.load_word2vec_format(w2v_path)

import io

# Gensim >= 4.0 exposes the vocabulary as `index_to_key`; earlier releases
# call it `index2word` (which raises AttributeError on 4.x). Support both.
vocab = getattr(w2v, 'index_to_key', None)
if vocab is None:
    vocab = w2v.index2word

# Two files are produced, line N of one describing line N of the other:
#
#   vecs.tsv -- one vector per line, components separated by a tab, e.g.
#       0.1<TAB>0.2<TAB>0.5<TAB>0.9
#       0.2<TAB>0.1<TAB>5.0<TAB>0.2
#       0.4<TAB>0.1<TAB>7.0<TAB>0.8
#
#   meta.tsv -- one token per line, e.g.
#       token1
#       token2
#       token3
#
# `with` guarantees both files are flushed and closed even when a write
# fails part-way through the vocabulary.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word, vec in zip(vocab, w2v.vectors):
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")

# Then use the Embedding Projector at `http://projector.tensorflow.org/` to visualize those two files.

# 1. Open the Embedding Projector.
# 2. Click on "Load data".
# 3. Upload the two files we created above: vecs.tsv and meta.tsv.

## w2v_visualizer.py
# encoding: utf-8
"""
@author: BrikerMan
@contact: eliyar917@gmail.com
@blog: https://eliyar.biz
@version: 1.0
@license: Apache Licence
@file: w2v_visualizer.py
@time: 2017/7/30 上午9:37
"""
import sys
import os
import pathlib
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector


def visualize(model, output_path):
    """Export word vectors as a TensorBoard Embedding Projector log directory.

    Three artefacts are written into ``output_path``: a metadata file with
    one word per line, a checkpoint holding the embedding matrix, and the
    projector configuration that links the two.

    Args:
        model: gensim word vectors (``KeyedVectors``), or an object exposing
            them through a ``wv`` attribute. Must provide ``vector_size``.
        output_path: directory receiving the files; it is not created here,
            so it has to exist already.
    """
    meta_file = "w2x_metadata.tsv"

    # Accept either bare KeyedVectors or a full model wrapping them in `.wv`.
    vectors = getattr(model, 'wv', model)
    # Gensim >= 4.0 renamed `index2word` to `index_to_key`; support both.
    vocab = getattr(vectors, 'index_to_key', None)
    if vocab is None:
        vocab = vectors.index2word

    placeholder = np.zeros((len(vocab), model.vector_size))

    with open(os.path.join(output_path, meta_file), 'wb') as file_metadata:
        for i, word in enumerate(vocab):
            placeholder[i] = vectors[word]
            # Workaround for https://github.com/tensorflow/tensorflow/issues/9094:
            # an empty metadata line breaks TensorBoard, so an empty token is
            # written as a visible stand-in instead.
            if word == '':
                print("Emply Line, should replecaed by any thing else, or will cause a bug of tensorboard")
                label = '<Empty Line>'
            else:
                label = word
            file_metadata.write("{0}".format(label).encode('utf-8') + b'\n')

    # Build everything in a private graph and a scoped session. Using the
    # global default graph would rename the variable to 'w2x_metadata_1' on a
    # second call while the projector config still pointed at the stale
    # 'w2x_metadata'; the context managers also release the session on exit.
    with tf.Graph().as_default(), tf.Session() as sess:
        # No training happens: the variable only carries the matrix into the
        # checkpoint under the name the projector config refers to.
        tf.Variable(placeholder, trainable=False, name='w2x_metadata')
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        writer = tf.summary.FileWriter(output_path, sess.graph)
        try:
            # Register the embedding and its metadata file with the projector.
            config = projector.ProjectorConfig()
            embed = config.embeddings.add()
            embed.tensor_name = 'w2x_metadata'
            embed.metadata_path = meta_file

            projector.visualize_embeddings(writer, config)
            saver.save(sess, os.path.join(output_path, 'w2x_metadata.ckpt'))
        finally:
            # Flush pending events and release the event-file handle.
            writer.close()

    print('Run `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path))


if __name__ == "__main__":
    # Usage:
    #   1. Save the vectors with `model.save_word2vec_format(...)`.
    #   2. Run `python w2v_visualizer.py word2vec.text visualize_result`.
    try:
        model_path = sys.argv[1]
        output_path = sys.argv[2]
    except IndexError:
        # Stop here: continuing without both arguments would only fail
        # later with a confusing NameError.
        print("Please provice model path and output path")
        sys.exit(1)
    model = KeyedVectors.load_word2vec_format(model_path)
    pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
    visualize(model, output_path)
	from gensim.models import KeyedVectors

	# Load the embeddings from a word2vec-format file with gensim.
	w2v_path = '<Gensim File Path>'
	w2v = KeyedVectors.load_word2vec_format(w2v_path)

	import io

	# Gensim >= 4.0 exposes the vocabulary as `index_to_key`; earlier releases
	# call it `index2word` (which raises AttributeError on 4.x). Support both.
	vocab = getattr(w2v, 'index_to_key', None)
	if vocab is None:
		vocab = w2v.index2word

	# Two files are produced, line N of one describing line N of the other:
	#
	#   vecs.tsv -- one vector per line, components separated by a tab, e.g.
	#       0.1<TAB>0.2<TAB>0.5<TAB>0.9
	#       0.2<TAB>0.1<TAB>5.0<TAB>0.2
	#       0.4<TAB>0.1<TAB>7.0<TAB>0.8
	#
	#   meta.tsv -- one token per line, e.g.
	#       token1
	#       token2
	#       token3
	#
	# `with` guarantees both files are flushed and closed even when a write
	# fails part-way through the vocabulary.
	with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
			io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
		for word, vec in zip(vocab, w2v.vectors):
			out_m.write(word + "\n")
			out_v.write('\t'.join([str(x) for x in vec]) + "\n")

	# Then use the Embedding Projector at `http://projector.tensorflow.org/` to visualize those two files.

	# 1. Open the Embedding Projector.
	# 2. Click on "Load data".
	# 3. Upload the two files we created above: vecs.tsv and meta.tsv.
	# encoding: utf-8
	"""
	@author: BrikerMan
	@contact: eliyar917@gmail.com
	@blog: https://eliyar.biz
	@version: 1.0
	@license: Apache Licence
	@file: w2v_visualizer.py
	@time: 2017/7/30 上午9:37
	"""
	import sys
	import os
	import pathlib
	import numpy as np
	from gensim.models.keyedvectors import KeyedVectors
	import tensorflow as tf
	from tensorflow.contrib.tensorboard.plugins import projector


	def visualize(model, output_path):
		"""Export word vectors as a TensorBoard Embedding Projector log directory.

		Three artefacts are written into ``output_path``: a metadata file with
		one word per line, a checkpoint holding the embedding matrix, and the
		projector configuration that links the two.

		Args:
			model: gensim word vectors (``KeyedVectors``), or an object exposing
				them through a ``wv`` attribute. Must provide ``vector_size``.
			output_path: directory receiving the files; it is not created here,
				so it has to exist already.
		"""
		meta_file = "w2x_metadata.tsv"

		# Accept either bare KeyedVectors or a full model wrapping them in `.wv`.
		vectors = getattr(model, 'wv', model)
		# Gensim >= 4.0 renamed `index2word` to `index_to_key`; support both.
		vocab = getattr(vectors, 'index_to_key', None)
		if vocab is None:
			vocab = vectors.index2word

		placeholder = np.zeros((len(vocab), model.vector_size))

		with open(os.path.join(output_path, meta_file), 'wb') as file_metadata:
			for i, word in enumerate(vocab):
				placeholder[i] = vectors[word]
				# Workaround for https://github.com/tensorflow/tensorflow/issues/9094:
				# an empty metadata line breaks TensorBoard, so an empty token is
				# written as a visible stand-in instead.
				if word == '':
					print("Emply Line, should replecaed by any thing else, or will cause a bug of tensorboard")
					label = '<Empty Line>'
				else:
					label = word
				file_metadata.write("{0}".format(label).encode('utf-8') + b'\n')

		# Build everything in a private graph and a scoped session. Using the
		# global default graph would rename the variable to 'w2x_metadata_1' on a
		# second call while the projector config still pointed at the stale
		# 'w2x_metadata'; the context managers also release the session on exit.
		with tf.Graph().as_default(), tf.Session() as sess:
			# No training happens: the variable only carries the matrix into the
			# checkpoint under the name the projector config refers to.
			tf.Variable(placeholder, trainable=False, name='w2x_metadata')
			sess.run(tf.global_variables_initializer())

			saver = tf.train.Saver()
			writer = tf.summary.FileWriter(output_path, sess.graph)
			try:
				# Register the embedding and its metadata file with the projector.
				config = projector.ProjectorConfig()
				embed = config.embeddings.add()
				embed.tensor_name = 'w2x_metadata'
				embed.metadata_path = meta_file

				projector.visualize_embeddings(writer, config)
				saver.save(sess, os.path.join(output_path, 'w2x_metadata.ckpt'))
			finally:
				# Flush pending events and release the event-file handle.
				writer.close()

		print('Run `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path))


	if __name__ == "__main__":
		# Usage:
		#   1. Save the vectors with `model.save_word2vec_format(...)`.
		#   2. Run `python w2v_visualizer.py word2vec.text visualize_result`.
		try:
			model_path = sys.argv[1]
			output_path = sys.argv[2]
		except IndexError:
			# Stop here: continuing without both arguments would only fail
			# later with a confusing NameError.
			print("Please provice model path and output path")
			sys.exit(1)
		model = KeyedVectors.load_word2vec_format(model_path)
		pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
		visualize(model, output_path)