Helper classes to work with embeddings in scikit-learn
import logging
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

logger = logging.getLogger(__name__)

class BaseEmbedding(ABC):
    """Adapter for different embeddings.

    You must implement all methods according to your embedding model.
    """

    @abstractmethod
    def __getitem__(self, word):
        pass

    @abstractmethod
    def __contains__(self, word):
        pass

    @abstractmethod
    def vector_size(self):
        """Returns the embedding vector size."""
        pass
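
# A concrete adapter only needs the three methods above. For example, a
# minimal in-memory adapter over a plain {word: vector} dict could look
# like this (a sketch, not part of the original gist):
#
#   class DictEmbedding(BaseEmbedding):
#       def __init__(self, vectors):
#           self.vectors = vectors
#
#       def __getitem__(self, word):
#           return self.vectors[word]
#
#       def __contains__(self, word):
#           return word in self.vectors
#
#       def vector_size(self):
#           return len(next(iter(self.vectors.values())))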

class GensimEmbedding(BaseEmbedding):
    """Embeddings from Gensim."""

    def __init__(self, model: KeyedVectors = None):
        """Creates embeddings from Gensim's KeyedVectors."""
        self.model = model

    def __getitem__(self, word):
        return self.model[word]

    def __contains__(self, word):
        return word in self.model

    def vector_size(self):
        return self.model.vector_size

    def load_file(self, file_path):
        """Load embeddings from a file in Gensim's native format."""
        self.model = KeyedVectors.load(file_path)
        return self
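
# Example usage (a sketch; "vectors.kv" is a hypothetical path to a
# KeyedVectors file previously saved with KeyedVectors.save()):
#
#   embedding = GensimEmbedding().load_file("vectors.kv")
#   "king" in embedding      # vocabulary lookup
#   embedding["king"]        # numpy vector of length embedding.vector_size()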

class Word2VecEmbedding(BaseEmbedding):
    """Embeddings from Word2Vec.

    If no pretrained model file is provided, a new model can be trained
    with `train`.
    """

    def __getitem__(self, word):
        return self.model[word]

    def __contains__(self, word):
        return word in self.model

    def vector_size(self):
        return self.model.vector_size

    def train(self, tokens, **params):
        """Trains a Word2Vec model from a pandas Series of token lists.

        Extra keyword arguments are forwarded to gensim's Word2Vec.
        """
        temp_model = Word2Vec(tokens.to_list(), **params)
        # Keep only the word vectors; the full model is not needed after training
        self.model = temp_model.wv
        del temp_model
        return self

    def load_file(self, file_path, file_binary=False):
        """Load pretrained embeddings from a file in word2vec format."""
        self.model = KeyedVectors.load_word2vec_format(file_path, binary=file_binary)
        self.trainable = False
        return self
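
# Example usage (a sketch; keyword names follow the gensim >= 4 API, where
# the vector dimension is `vector_size`; gensim 3 called it `size`):
#
#   tokens = pd.Series(["a sample sentence", "another one"]).str.split()
#   embedding = Word2VecEmbedding().train(tokens, vector_size=100, min_count=1)
#
# or, with a hypothetical pretrained file in binary word2vec format:
#
#   embedding = Word2VecEmbedding().load_file("vectors.bin", file_binary=True)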

class GloveEmbedding(BaseEmbedding):
    """Embeddings from the GloVe text format."""

    def __getitem__(self, word):
        return self.model[word]

    def __contains__(self, word):
        return word in self.model

    def vector_size(self):
        return self.vector_dim

    def load_file(self, file_path):
        """Load embeddings from a GloVe text file (one word and its vector per line)."""
        embeddings_dict = {}
        logger.info("Loading GloVe embeddings from: %s", file_path)
        with open(file_path, mode="r", encoding="utf-8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                embeddings_dict[word] = vector
        # All vectors share the same size; take it from the last one read
        self.vector_dim = len(vector)
        logger.info(
            "Loaded GloVe embeddings [%d, %d]!", len(embeddings_dict), self.vector_dim
        )
        self.model = embeddings_dict
        return self
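
# Example usage (a sketch; "glove.6B.100d.txt" is one of the standard
# Stanford GloVe files, with one "word v1 v2 ... vn" entry per line):
#
#   embedding = GloveEmbedding().load_file("glove.6B.100d.txt")
#   embedding.vector_size()  # 100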

class MeanEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    """Vectorizer that computes a sentence embedding as the average of
    its word vectors.
    """

    def __init__(self, model: BaseEmbedding, tokenizer=None):
        self.model = model
        self.tokenizer = str.split if tokenizer is None else tokenizer

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Average the vectors of in-vocabulary tokens; sentences with no
        # known token get a zero vector
        return np.array(
            [
                np.mean(
                    [self.model[token] for token in self.tokenizer(sentence) if token in self.model]
                    or [np.zeros(self.model.vector_size())],
                    axis=0,
                )
                for sentence in X
            ]
        )
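
# Since the vectorizer is a regular scikit-learn transformer, it can sit at
# the start of a Pipeline (a sketch; the classifier choice and the
# train_sentences/train_labels names are placeholders):
#
#   from sklearn.pipeline import Pipeline
#   from sklearn.linear_model import LogisticRegression
#
#   pipeline = Pipeline([
#       ("vectorizer", MeanEmbeddingVectorizer(embedding)),
#       ("classifier", LogisticRegression()),
#   ])
#   pipeline.fit(train_sentences, train_labels)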

class WeightedEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    """Vectorizer that computes a sentence embedding as the tf-idf
    weighted average of its word vectors.

    If a label vector y is given to `fit`, sentences are grouped by label
    before fitting tf-idf (grouped tf-idf).
    """

    def __init__(self, model: BaseEmbedding, tokenizer=None):
        self.model = model
        # Note: the tokenizer should agree with TfidfVectorizer's
        # preprocessing (which lowercases by default), or tokens will not
        # match the tf-idf vocabulary
        self.tokenizer = str.split if tokenizer is None else tokenizer

    def fit(self, X, y=None):
        self.__build_tfidf_vectorizer(X, y)
        return self

    def __build_tfidf_vectorizer(self, X, y=None):
        if y is None:
            logger.info("Tf-idf using independent sentences")
            X_new = X
        else:
            logger.info("Tf-idf using grouped sentences")
            X_new = (
                pd.DataFrame({"x": X, "y": y})
                .groupby("y", as_index=False)
                .agg(" ".join)["x"]
                .values.astype(str)
            )
        self.tfidf_vectorizer = TfidfVectorizer()
        self.tfidf_vectorizer.fit(X_new)
        # vocabulary_ maps each term to its column in the tf-idf matrix
        # (the original get_feature_names() call was removed in scikit-learn 1.2)
        self.feature_names = dict(self.tfidf_vectorizer.vocabulary_)

    def transform(self, X):
        # Weighted mean: each word vector is scaled by its tf-idf score in
        # the sentence before averaging; sentences with no known token get
        # a zero vector
        tfidf_feats = self.tfidf_vectorizer.transform(X)
        return np.array(
            [
                np.mean(
                    [
                        self.model[w] * tfidf_feats[sent_id, self.feature_names[w]]
                        for w in self.tokenizer(sentence)
                        if w in self.model and w in self.feature_names
                    ]
                    or [np.zeros(self.model.vector_size())],
                    axis=0,
                )
                for sent_id, sentence in enumerate(X)
            ]
        )
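

if __name__ == "__main__":
    # Minimal end-to-end demo (not part of the original gist): trains a
    # tiny Word2Vec model on a toy corpus and runs both vectorizers.
    # Keyword names assume gensim >= 4.
    sentences = pd.Series(
        [
            "the cat sat on the mat",
            "the dog sat on the log",
            "cats and dogs are pets",
        ]
    )
    labels = [0, 0, 1]

    embedding = Word2VecEmbedding().train(
        sentences.str.split(), vector_size=16, min_count=1, seed=0
    )

    mean_vec = MeanEmbeddingVectorizer(embedding)
    print(mean_vec.fit_transform(sentences).shape)  # (3, 16)

    weighted_vec = WeightedEmbeddingVectorizer(embedding)
    # Passing labels enables grouped tf-idf
    print(weighted_vec.fit(sentences, labels).transform(sentences).shape)  # (3, 16)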