# ottokart/word2vec-binary-to-text.py

## word2vec-binary-to-text.py
# coding: utf-8
from __future__ import division

import struct
import sys
import gzip

FILE_NAME = "GoogleNews-vectors-negative300.bin.gz" # outputs GoogleNews-vectors-negative300.bin.gz.txt
MAX_VECTORS = 100000 # Top words to take
FLOAT_SIZE = 4 # 32bit float

output_file_name = FILE_NAME + ".txt"


def _read_until(stream, delimiter):
    """Return the bytes read from *stream* up to the next *delimiter*.

    The delimiter is consumed but is not part of the returned value.

    Raises:
        EOFError: if the stream ends before the delimiter is seen.
    """
    pieces = []
    while True:
        c = stream.read(1)
        if not c:
            # read() yields an empty string at end of file; without this
            # check a truncated input would keep the loop spinning forever.
            raise EOFError("unexpected end of file while looking for %r" % (delimiter,))
        if c == delimiter:
            return b"".join(pieces)
        pieces.append(c)


def convert(input_path, output_path, max_vectors=MAX_VECTORS):
    """Convert a gzipped word2vec binary file into a text file.

    The input starts with a header line "<vector count> <vector length>",
    followed by one record per word: the word, a single space, then
    ``vector_len`` floats of ``FLOAT_SIZE`` bytes each (native byte order).
    Each output line is the word followed by its space-separated components.

    Args:
        input_path: path of the gzip-compressed binary file to read.
        output_path: path of the text file to write.
        max_vectors: upper bound on the number of leading records converted.

    Returns:
        The number of vectors written.

    Raises:
        EOFError: if the input ends before the announced data was read.
    """
    # The output is written as bytes so that the word is copied through
    # untouched and the same code runs on Python 2 and Python 3.
    with gzip.open(input_path, 'rb') as f, open(output_path, 'wb') as f_out:

        # read the header
        header = _read_until(f, b"\n")
        total_num_vectors, vector_len = (int(x) for x in header.split())
        num_vectors = min(max_vectors, total_num_vectors)

        print("Taking embeddings of top %d words (out of %d total)" % (num_vectors, total_num_vectors))
        print("Embedding size: %d" % vector_len)

        vector_size = FLOAT_SIZE * vector_len
        unpack_vector = struct.Struct("%df" % vector_len).unpack
        last_percent = -1

        for j in range(num_vectors):

            # Files produced by the word2vec tool end every vector with a
            # newline, which would otherwise be glued to the front of the
            # following word.
            word = _read_until(f, b" ").lstrip(b"\n")

            binary_vector = f.read(vector_size)
            if len(binary_vector) != vector_size:
                raise EOFError("unexpected end of file inside vector %d" % (j + 1))
            txt_vector = ["%s" % x for x in unpack_vector(binary_vector)]

            f_out.write(word + b" " + " ".join(txt_vector).encode("ascii") + b"\n")

            # Only touch the terminal when the displayed percentage changes.
            percent = (j + 1) * 100 // num_vectors
            if percent != last_percent:
                sys.stdout.write("%d%%\r" % percent)
                sys.stdout.flush()
                last_percent = percent

    return num_vectors


if __name__ == "__main__":
    convert(FILE_NAME, output_file_name, MAX_VECTORS)
    print("\nDONE!")
    print("Output written to %s" % output_file_name)
	# coding: utf-8
	from __future__ import division

	import struct
	import sys
	import gzip

	FILE_NAME = "GoogleNews-vectors-negative300.bin.gz" # outputs GoogleNews-vectors-negative300.bin.gz.txt
	MAX_VECTORS = 100000 # Top words to take
	FLOAT_SIZE = 4 # 32bit float

	output_file_name = FILE_NAME + ".txt"


	def _read_until(stream, delimiter):
		"""Return the bytes read from *stream* up to the next *delimiter*.

		The delimiter is consumed but is not part of the returned value.

		Raises:
			EOFError: if the stream ends before the delimiter is seen.
		"""
		pieces = []
		while True:
			c = stream.read(1)
			if not c:
				# read() yields an empty string at end of file; without this
				# check a truncated input would keep the loop spinning forever.
				raise EOFError("unexpected end of file while looking for %r" % (delimiter,))
			if c == delimiter:
				return b"".join(pieces)
			pieces.append(c)


	def convert(input_path, output_path, max_vectors=MAX_VECTORS):
		"""Convert a gzipped word2vec binary file into a text file.

		The input starts with a header line "<vector count> <vector length>",
		followed by one record per word: the word, a single space, then
		``vector_len`` floats of ``FLOAT_SIZE`` bytes each (native byte order).
		Each output line is the word followed by its space-separated components.

		Args:
			input_path: path of the gzip-compressed binary file to read.
			output_path: path of the text file to write.
			max_vectors: upper bound on the number of leading records converted.

		Returns:
			The number of vectors written.

		Raises:
			EOFError: if the input ends before the announced data was read.
		"""
		# The output is written as bytes so that the word is copied through
		# untouched and the same code runs on Python 2 and Python 3.
		with gzip.open(input_path, 'rb') as f, open(output_path, 'wb') as f_out:

			# read the header
			header = _read_until(f, b"\n")
			total_num_vectors, vector_len = (int(x) for x in header.split())
			num_vectors = min(max_vectors, total_num_vectors)

			print("Taking embeddings of top %d words (out of %d total)" % (num_vectors, total_num_vectors))
			print("Embedding size: %d" % vector_len)

			vector_size = FLOAT_SIZE * vector_len
			unpack_vector = struct.Struct("%df" % vector_len).unpack
			last_percent = -1

			for j in range(num_vectors):

				# Files produced by the word2vec tool end every vector with a
				# newline, which would otherwise be glued to the front of the
				# following word.
				word = _read_until(f, b" ").lstrip(b"\n")

				binary_vector = f.read(vector_size)
				if len(binary_vector) != vector_size:
					raise EOFError("unexpected end of file inside vector %d" % (j + 1))
				txt_vector = ["%s" % x for x in unpack_vector(binary_vector)]

				f_out.write(word + b" " + " ".join(txt_vector).encode("ascii") + b"\n")

				# Only touch the terminal when the displayed percentage changes.
				percent = (j + 1) * 100 // num_vectors
				if percent != last_percent:
					sys.stdout.write("%d%%\r" % percent)
					sys.stdout.flush()
					last_percent = percent

		return num_vectors


	if __name__ == "__main__":
		convert(FILE_NAME, output_file_name, MAX_VECTORS)
		print("\nDONE!")
		print("Output written to %s" % output_file_name)