Created
January 11, 2018 08:26
-
-
Save jovianlin/0a6b7c58cde7a502a68914ba001c77bf to your computer and use it in GitHub Desktop.
load_glove_embeddings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
import numpy as np | |
def load_glove_embeddings(fp, embedding_dim, include_empty_char=True):
    """
    Loads pre-trained word embeddings (GloVe embeddings).

    Inputs:
        - fp: filepath of the pre-trained GloVe embeddings text file
        - embedding_dim: dimension of each vector embedding
        - include_empty_char: whether to reserve an extra all-zeros row
          (keyed by the empty string '' in word2index) for padding
    Outputs:
        - word2index: Dictionary. Word to word-index.
        - embedding_matrix: np.ndarray of shape (vocab_size, embedding_dim)
          for a Keras Embedding layer; row i is the vector of the word whose
          index is i.
    """
    # First, build the "word2coefs" and "word2index" mappings.
    word2coefs = {}  # word -> its embedding coefficients
    word2index = {}  # word -> word-index
    # GloVe files are distributed as UTF-8 text; be explicit so loading does
    # not depend on the platform's default locale encoding.
    with open(fp, encoding='utf-8') as f:
        for idx, line in enumerate(f):
            try:
                data = [x.strip().lower() for x in line.split()]
                word = data[0]
                coefs = np.asarray(data[1:embedding_dim+1], dtype='float32')
                word2coefs[word] = coefs
                if word not in word2index:
                    word2index[word] = len(word2index)
            # Only parsing errors are expected here: IndexError from an empty
            # line, ValueError from non-numeric coefficients. Skip the bad
            # line instead of aborting the whole load.
            except (ValueError, IndexError) as e:
                print('Exception occurred in `load_glove_embeddings`:', e)
                continue
        # End of for loop.
    # End of with open
    if include_empty_char:
        # Reserve one extra index for the empty-string token; its row in the
        # matrix is left all-zeros below.
        word2index[''] = len(word2index)
    # Second, build the "embedding_matrix".
    # Words not found in embedding index will be all-zeros. Hence, the "+1".
    vocab_size = len(word2coefs)+1 if include_empty_char else len(word2coefs)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word2index.items():
        embedding_vec = word2coefs.get(word)
        # Guard against truncated lines whose vector has the wrong length.
        if embedding_vec is not None and embedding_vec.shape[0] == embedding_dim:
            embedding_matrix[idx] = embedding_vec
    return word2index, embedding_matrix
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment