Skip to content

Instantly share code, notes, and snippets.

@shubham0204
Last active June 23, 2020 09:56
Show Gist options
  • Save shubham0204/531f75c37c49362c9b61a11865496177 to your computer and use it in GitHub Desktop.
import numpy as np
import pickle
import os

# You need to download the GloVe embeddings ( in txt files ) from here -> http://nlp.stanford.edu/data/glove.6B.zip


def load_glove_embeddings(glove_path):
    """Parse a GloVe text file into a dict mapping word -> float32 vector.

    Each line of the file is expected to be: `<word> <v1> <v2> ... <vD>`.
    Returns a dict {word: np.ndarray of shape (D,), dtype float32}.
    """
    embeddings_index = {}
    # `with` guarantees the file is closed even if a malformed line raises.
    with open(glove_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index


def build_embedding_matrix(word_index, embeddings_index, vocab_size, output_dim):
    """Build a (vocab_size, output_dim) embedding matrix.

    Row i holds the GloVe vector for the word mapped to index i in
    `word_index`; words absent from `embeddings_index` keep an all-zero row
    (row 0 is always zeros — Keras tokenizer indices start at 1).

    word_index       : dict {word: int index}, e.g. `tokenizer.word_index`.
    embeddings_index : dict {word: vector} as returned by load_glove_embeddings.
    vocab_size       : number of rows; should be `len(word_index) + 1`.
    output_dim       : embedding dimension; must match the GloVe file used.
    """
    embedding_matrix = np.zeros((vocab_size, output_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


def main():
    """Interactively collect paths/sizes, then build and save the matrix."""
    # The vocabulary size. It is the value of `len( tokenizer.word_index )+1`
    vocab_size = int(input('Enter Vocabulary Size : '))
    # The path to the GloVe file. For instance, "glove.6B/glove.6B.50d" where
    # "50" represents the output dimension of the embedding.
    glove_path = input('Enter path to GloVe text file : ')
    # The filepath of the tf.keras.preprocessing.text.Tokenizer object in
    # pickled form. This object was earlier used to tokenize the question and
    # answer pairs.
    tokenizer_path = input('Enter pickled Tokenizer path : ')
    # The output directory where the files will be stored.
    output_dir = input( 'Enter path of output directory : ')
    # The output dimensions for the embedding. If `glove_path` refers to
    # `glove.6B.50d` then this argument should be equal to 50.
    output_dim = int(input('Enter output dimension for Embedding : '))
    # The filename for the saved numpy array. For example, "embedding_matrix.npy"
    embedding_filename = input('Enter file name : ' )

    # Load the whole embedding into memory.
    embeddings_index = load_glove_embeddings(glove_path)
    print('Loaded %s word vectors.' % len(embeddings_index))

    # NOTE(review): pickle.load on an untrusted file can execute arbitrary
    # code — only load tokenizers you created yourself.
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)

    embedding_matrix = build_embedding_matrix(
        tokenizer.word_index, embeddings_index, vocab_size, output_dim
    )
    np.save(os.path.join(output_dir, '{}'.format(embedding_filename)), embedding_matrix)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment