import numpy as np
import pickle
import os

# Builds a GloVe embedding matrix for a tf.keras Tokenizer and saves it as a NumPy array.
# Download the GloVe embeddings (plain-text files) from http://nlp.stanford.edu/data/glove.6B.zip

# The vocabulary size, i.e. the value of `len( tokenizer.word_index ) + 1`.
vocab_size = int( input( 'Enter Vocabulary Size : ' ) )

# The path to the GloVe text file. For instance, "glove.6B/glove.6B.50d.txt",
# where "50" is the dimensionality of the embedding vectors.
glove_path = input( 'Enter path to GloVe text file : ' )

# The filepath of the tf.keras.preprocessing.text.Tokenizer object in pickled form.
# This object was earlier used to tokenize the question and answer pairs.
tokenizer_path = input( 'Enter pickled Tokenizer path : ' )

# The output directory where the files will be stored.
output_dir = input( 'Enter path of output directory : ' )

# The output dimension of the embedding. If `glove_path` refers to
# "glove.6B.50d.txt", this argument should be equal to 50.
output_dim = int( input( 'Enter output dimension for Embedding : ' ) )

# The filename for the saved NumPy array. For example, "embedding_matrix.npy".
embedding_filename = input( 'Enter file name : ' )

# Load the whole embedding into memory as a word -> vector dict.
# Each line of the GloVe file has the form: word v1 v2 ... vd
embeddings_index = dict()
with open( glove_path , encoding='utf8' ) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray( values[1:] , dtype='float32' )
        embeddings_index[word] = coefs
print( 'Loaded %s word vectors.' % len( embeddings_index ) )

# Fill the embedding matrix: row `i` holds the GloVe vector of the word whose
# Tokenizer index is `i`. Words missing from GloVe keep all-zero rows.
with open( tokenizer_path , 'rb' ) as f:
    tokenizer = pickle.load( f )
embedding_matrix = np.zeros( ( vocab_size , output_dim ) )
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get( word )
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

np.save( os.path.join( output_dir , embedding_filename ) , embedding_matrix )
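
# A minimal usage sketch (an assumption, not part of the original gist): with
# TensorFlow 2.x available, the saved matrix can initialize a frozen
# tf.keras.layers.Embedding layer via a Constant initializer, so the model
# starts from the pretrained GloVe vectors instead of random weights.
import tensorflow as tf

embedding_matrix = np.load( os.path.join( output_dir , embedding_filename ) )
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=output_dim,
    embeddings_initializer=tf.keras.initializers.Constant( embedding_matrix ),
    trainable=False  # keep the pretrained GloVe vectors frozen during training
)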