Skip to content

Instantly share code, notes, and snippets.

@ottokart
Last active February 1, 2021 17:25
Show Gist options
  • Save ottokart/4031dfb471ad5c11d97ad72cbc01b934 to your computer and use it in GitHub Desktop.
Save ottokart/4031dfb471ad5c11d97ad72cbc01b934 to your computer and use it in GitHub Desktop.
Python script to convert pre-trained word2vec word embeddings from the binary format into a text format in which each line starts with a word, followed by the entries of the corresponding embedding vector separated by spaces. E.g., "dog 0.41231234567890 0.355122341578123 ..."
# coding: utf-8
from __future__ import division
import struct
import sys
import gzip
FILE_NAME = "GoogleNews-vectors-negative300.bin.gz"  # outputs GoogleNews-vectors-negative300.bin.gz.txt
MAX_VECTORS = 100000  # Top words to take
FLOAT_SIZE = 4  # 32bit float
output_file_name = FILE_NAME + ".txt"


def _read_header(f):
    """Parse the first line of the binary file.

    The header is an ASCII line of the form "<number of vectors> <vector
    length>" terminated by a newline.

    Returns:
        A ``(total_num_vectors, vector_len)`` tuple of ints.

    Raises:
        ValueError: if the line is missing, unterminated, or does not hold
            exactly two integers.
    """
    header = f.readline()
    if not header.endswith(b"\n"):
        raise ValueError("Truncated file: incomplete header line")
    total_num_vectors, vector_len = (int(x) for x in header.split())
    return total_num_vectors, vector_len


def _read_word(f):
    """Read one space-terminated word and return it as raw bytes.

    Raises:
        ValueError: if the stream ends before the terminating space.
    """
    chars = []
    while True:
        c = f.read(1)
        if not c:
            # read() returns an empty string at EOF; without this check a
            # truncated file would make the loop spin forever.
            raise ValueError("Unexpected end of file while reading a word")
        if c == b" ":
            break
        if c == b"\n" and not chars:
            # Files written by the reference word2vec tool end every vector
            # with a newline, which would otherwise be glued to the front of
            # the next word.
            continue
        chars.append(c)
    return b"".join(chars)


def convert_word2vec_binary_to_text(file_name, out_name, max_vectors):
    """Convert gzipped binary word2vec embeddings into a text file.

    Every output line holds a word followed by its vector entries, all
    separated by single spaces. Word bytes are copied through unchanged, so
    no assumption is made about their text encoding.

    Args:
        file_name: path of the gzip-compressed binary embeddings file.
        out_name: path of the text file to create (overwritten if present).
        max_vectors: upper bound on the number of leading vectors to convert.

    Returns:
        The number of vectors written.

    Raises:
        ValueError: if the input ends before all expected data was read.
    """
    # Binary output keeps the bytes of each word exactly as stored and makes
    # the script behave the same on Python 2 and Python 3.
    with gzip.open(file_name, "rb") as f, open(out_name, "wb") as f_out:
        total_num_vectors, vector_len = _read_header(f)
        num_vectors = min(max_vectors, total_num_vectors)
        print("Taking embeddings of top %d words (out of %d total)" % (num_vectors, total_num_vectors))
        print("Embedding size: %d" % vector_len)

        # Native-order 32-bit floats, decoded a whole row at a time.
        vector_struct = struct.Struct("%df" % vector_len)
        num_bytes = FLOAT_SIZE * vector_len
        shown_percent = -1

        for j in range(num_vectors):
            word = _read_word(f)
            binary_vector = f.read(num_bytes)
            if len(binary_vector) != num_bytes:
                raise ValueError(
                    "Unexpected end of file while reading vector %d" % (j + 1))
            txt_vector = " ".join("%s" % x for x in vector_struct.unpack(binary_vector))
            f_out.write(word + b" " + txt_vector.encode("ascii") + b"\n")

            # Integer arithmetic avoids float rounding (e.g. 28.999... -> 28)
            # and the display is refreshed only when the value changes.
            percent = 100 * (j + 1) // num_vectors
            if percent != shown_percent:
                shown_percent = percent
                sys.stdout.write("%d%%\r" % percent)
                sys.stdout.flush()
    return max(num_vectors, 0)


def main():
    """Convert FILE_NAME using the module-level settings and report the result."""
    convert_word2vec_binary_to_text(FILE_NAME, output_file_name, MAX_VECTORS)
    print("\nDONE!")
    print("Output written to %s" % output_file_name)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment