Skip to content

Instantly share code, notes, and snippets.

@prhbrt
Created October 25, 2017 20:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save prhbrt/5dd56ddf5aab482026cafa9c09dfd00f to your computer and use it in GitHub Desktop.
Save prhbrt/5dd56ddf5aab482026cafa9c09dfd00f to your computer and use it in GitHub Desktop.
import numpy
from gensim.models.keyedvectors import Vocab, KeyedVectors
# Convert the Twitter datasets found here to gensim word2vec format: https://github.com/3Top/word2vec-api
# Convert a GloVe text file into a gensim KeyedVectors object and write it
# back out in word2vec format.
#
# Each input line is expected to look like "<word> <v1> <v2> ... <vN>".
# NOTE(review): this uses the pre-4.0 gensim attribute names (`syn0`,
# `index2word`, `vocab`) and the no-argument `KeyedVectors()` constructor --
# confirm the installed gensim version still supports them.

# GloVe files are UTF-8; be explicit instead of relying on the platform
# default encoding.
with open('glove.6B.300d.txt', encoding='utf-8') as fin:
    word_vecs = {}
    for line in fin:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Split off the word only once; the remainder holds the components.
        word, vec = line.split(' ', 1)
        word_vecs[word] = numpy.array(list(map(float, vec.split())))

result = KeyedVectors()
# Materialise the keys as a real list: a dict view is not indexable and
# would keep `word_vecs` alive after the `del` below.
result.index2word = list(word_vecs)

# One row per word, in the same order as `index2word`.
matrix = numpy.vstack([word_vecs[word] for word in result.index2word])
del word_vecs  # release the per-word arrays now that they have been stacked

result.syn0 = matrix
result.vector_size = matrix.shape[1]

# The source file carries no frequency information, so assign descending
# pseudo-counts: earlier words in the file get larger counts.
for word_id, word in enumerate(result.index2word):
    result.vocab[word] = Vocab(index=word_id, count=matrix.shape[0] - word_id)

# NOTE(review): the output is written as text (binary=False) despite the
# ".bin" extension -- file name kept as-is so existing consumers still find it.
result.save_word2vec_format('glove.6B.300d.bin', binary=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment