import numpy as np
import pickle
import os

# Builds a GloVe embedding matrix for a tf.keras Tokenizer and saves it as a NumPy array.
# Download the GloVe embeddings (plain-text files) from http://nlp.stanford.edu/data/glove.6B.zip

# The vocabulary size, i.e. the value of `len( tokenizer.word_index ) + 1`.
vocab_size = int( input( 'Enter Vocabulary Size : ' ) )

# The path to the GloVe text file. For instance, "glove.6B/glove.6B.50d.txt",
# where "50" is the dimensionality of the embedding vectors.
glove_path = input( 'Enter path to GloVe text file : ' )

# The filepath of the tf.keras.preprocessing.text.Tokenizer object in pickled form.
# This object was earlier used to tokenize the question and answer pairs.
tokenizer_path = input( 'Enter pickled Tokenizer path : ' )

# The output directory where the files will be stored.
output_dir = input( 'Enter path of output directory : ' )

# The output dimension of the embedding. If `glove_path` refers to
# "glove.6B.50d.txt", this argument should be equal to 50.
output_dim = int( input( 'Enter output dimension for Embedding : ' ) )

# The filename for the saved NumPy array. For example, "embedding_matrix.npy".
embedding_filename = input( 'Enter file name : ' )

# Load the whole embedding into memory as a word -> vector dict.
# Each line of the GloVe file has the form: word v1 v2 ... vd
embeddings_index = dict()
with open( glove_path , encoding='utf8' ) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray( values[1:] , dtype='float32' )
        embeddings_index[word] = coefs
print( 'Loaded %s word vectors.' % len( embeddings_index ) )

# Fill the embedding matrix: row `i` holds the GloVe vector of the word whose
# Tokenizer index is `i`. Words missing from GloVe keep all-zero rows.
with open( tokenizer_path , 'rb' ) as f:
    tokenizer = pickle.load( f )
embedding_matrix = np.zeros( ( vocab_size , output_dim ) )
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get( word )
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

np.save( os.path.join( output_dir , embedding_filename ) , embedding_matrix )
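
# A minimal usage sketch (an assumption, not part of the original gist): with
# TensorFlow 2.x available, the saved matrix can initialize a frozen
# tf.keras.layers.Embedding layer via a Constant initializer, so the model
# starts from the pretrained GloVe vectors instead of random weights.
import tensorflow as tf

embedding_matrix = np.load( os.path.join( output_dir , embedding_filename ) )
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=output_dim,
    embeddings_initializer=tf.keras.initializers.Constant( embedding_matrix ),
    trainable=False  # keep the pretrained GloVe vectors frozen during training
)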