Instantly share code, notes, and snippets.

Embed
What would you like to do?
load_glove_embeddings
# coding: utf-8
import numpy as np
def load_glove_embeddings(fp, embedding_dim, include_empty_char=True):
    """
    Load pre-trained GloVe word embeddings from a text file.

    Inputs:
        - fp: filepath of the pre-trained GloVe embeddings file
              (each line: a word followed by its vector components)
        - embedding_dim: dimension of each vector embedding
        - include_empty_char: if True, reserve one extra all-zeros row,
              indexed by the empty string ''
    Outputs:
        - word2index: Dictionary. Word to its row index in the matrix
        - embedding_matrix: numpy array of shape (vocab_size, embedding_dim),
              suitable for a Keras Embedding layer
    """
    # First, build the "word2coefs" and "word2index" lookups.
    word2coefs = {}  # word -> its embedding coefficients
    word2index = {}  # word -> row index
    # GloVe files are distributed as UTF-8; be explicit so the load does
    # not depend on the platform's default encoding.
    with open(fp, encoding='utf-8') as f:
        for line in f:
            try:
                data = [x.strip().lower() for x in line.split()]
                word = data[0]
                coefs = np.asarray(data[1:embedding_dim + 1], dtype='float32')
                word2coefs[word] = coefs
                if word not in word2index:
                    word2index[word] = len(word2index)
            except Exception as e:
                # Skip malformed lines rather than aborting the whole load.
                print('Exception occurred in `load_glove_embeddings`:', e)
                continue
    if include_empty_char:
        # The empty string gets the last index (its row stays all zeros).
        word2index[''] = len(word2index)
    # Second, build the "embedding_matrix".
    # Words not found in the embedding index stay all-zeros. Hence the "+1".
    vocab_size = len(word2coefs) + 1 if include_empty_char else len(word2coefs)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2index.items():
        embedding_vec = word2coefs.get(word)
        # Guard against truncated lines whose vector is too short.
        if embedding_vec is not None and embedding_vec.shape[0] == embedding_dim:
            embedding_matrix[idx] = embedding_vec
    return word2index, embedding_matrix
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment