Skip to content

Instantly share code, notes, and snippets.

@soaxelbrooke
Last active May 16, 2018 06:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save soaxelbrooke/32318ba4fc963810ec91832fccfd1b68 to your computer and use it in GitHub Desktop.
Word embeddings utility class for loading and transforming quickly.
import pandas
import numpy
import csv
from typing import List, Optional
class WordVectorizer:
    """Load word embeddings from a text file (word2vec / fastText ``.vec``
    format) and transform token lists into matrices of word vectors.

    Out-of-vocabulary tokens are mapped to an all-zeros vector.
    """

    def __init__(self, embeddings_path: str, embedding_dim: int,
                 limit: Optional[int] = None):
        """
        :param embeddings_path: path to a whitespace-separated embeddings file
            where each data line is ``word v1 v2 ... v_dim``.
        :param embedding_dim: dimensionality of the word vectors.
        :param limit: optional cap on the number of embedding rows to load.
        """
        with open(embeddings_path) as infile:
            # fastText .vec files begin with a "vocab_size dim" metadata line
            # (2 tokens); a real data line has embedding_dim + 1 tokens. If
            # the first line is data, rewind so it isn't lost.
            # BUG FIX: the original compared the token *list* to an int
            # (`line.split(' ') == embedding_dim + 1`), which is always
            # False, so headerless files silently lost their first row.
            if len(next(infile).rstrip('\n').split(' ')) == embedding_dim + 1:
                infile.seek(0)
            # sep=r'\s+' replaces the deprecated delim_whitespace=True.
            # QUOTE_NONE is required: tokens may contain quote characters.
            wv_df = pandas.read_csv(infile, header=None, sep=r'\s+',
                                    names=list(range(embedding_dim + 1)),
                                    quoting=csv.QUOTE_NONE, nrows=limit)
        # Column 0 holds the word; remaining columns are vector components.
        self.vectors = wv_df.drop(columns=[0]).values.astype(float)
        # Append a zero row so index -1 serves as the OOV vector.
        self.vectors = numpy.vstack([self.vectors,
                                     numpy.zeros((1, embedding_dim))])
        self.word_to_idx = {word: idx for idx, word in enumerate(wv_df[0])}

    def transform(self, tokens: List[str]) -> numpy.ndarray:
        """Transform a sentence into a ``(len(tokens), embedding_dim)``
        matrix of word vectors; unknown tokens get the zero vector."""
        indexes = [self.word_to_idx.get(token, -1) for token in tokens]
        return self.vectors[indexes]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment