Created
January 11, 2018 08:26
-
-
Save jovianlin/0a6b7c58cde7a502a68914ba001c77bf to your computer and use it in GitHub Desktop.
load_glove_embeddings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import numpy as np | |
def load_glove_embeddings(fp, embedding_dim, include_empty_char=True):
    """
    Loads pre-trained word embeddings (GloVe embeddings).

    Inputs:
        - fp: filepath of the pre-trained GloVe embeddings text file
        - embedding_dim: dimension of each vector embedding
        - include_empty_char: whether to reserve an extra all-zeros row
          (keyed by the empty string '' in word2index) for padding
    Outputs:
        - word2index: Dictionary. Word to word-index.
        - embedding_matrix: np.ndarray of shape (vocab_size, embedding_dim)
          for a Keras Embedding layer; row i is the vector of the word whose
          index is i.
    """
    # First, build the "word2coefs" and "word2index" mappings.
    word2coefs = {}  # word -> its embedding coefficients
    word2index = {}  # word -> word-index
    # GloVe files are distributed as UTF-8 text; be explicit so loading does
    # not depend on the platform's default locale encoding.
    with open(fp, encoding='utf-8') as f:
        for idx, line in enumerate(f):
            try:
                data = [x.strip().lower() for x in line.split()]
                word = data[0]
                coefs = np.asarray(data[1:embedding_dim+1], dtype='float32')
                word2coefs[word] = coefs
                if word not in word2index:
                    word2index[word] = len(word2index)
            # Only parsing errors are expected here: IndexError from an empty
            # line, ValueError from non-numeric coefficients. Skip the bad
            # line instead of aborting the whole load.
            except (ValueError, IndexError) as e:
                print('Exception occurred in `load_glove_embeddings`:', e)
                continue
        # End of for loop.
    # End of with open
    if include_empty_char:
        # Reserve one extra index for the empty-string token; its row in the
        # matrix is left all-zeros below.
        word2index[''] = len(word2index)
    # Second, build the "embedding_matrix".
    # Words not found in embedding index will be all-zeros. Hence, the "+1".
    vocab_size = len(word2coefs)+1 if include_empty_char else len(word2coefs)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2index.items():
        embedding_vec = word2coefs.get(word)
        # Guard against truncated lines whose vector has the wrong length.
        if embedding_vec is not None and embedding_vec.shape[0] == embedding_dim:
            embedding_matrix[idx] = embedding_vec
    return word2index, embedding_matrix
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment