Last active
February 18, 2022 13:51
-
-
Save MathiasGruber/fe14cd8eda75301ccc86b7d330e6f768 to your computer and use it in GitHub Desktop.
Embedding questions using a sentence-transformer model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch
from transformers import AutoTokenizer, AutoModel
def mean_pooling(model_output, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings into one vector per sequence, ignoring padding.

    See:
    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1

    Args:
        model_output: Indexable model result whose element ``0`` holds the
            token embeddings. Presumably shaped (batch, seq_len, hidden) --
            confirm against the model in use.
        attention_mask: Mask with one entry per token (non-zero for tokens to
            keep, zero for tokens to ignore). It is broadcast along a new
            trailing axis to the size of the token embeddings.

    Returns:
        The masked sum of the token embeddings over dimension 1 divided by
        the mask sum over that same dimension.
    """
    token_embeddings = model_output[0]

    # Give the mask a trailing axis and stretch it to the embeddings' size so
    # it can zero out ignored positions element-wise.
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )

    # Sum only the positions the mask keeps.
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)

    # Clamp the denominator so a fully masked row divides by 1e-9, not by 0.
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    return sum_embeddings / sum_mask
# Fetch the model & tokenizer from transformers library
model_name = 'sentence-transformers/stsb-roberta-large'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize input.
# NOTE(review): `sentences` is not defined in this snippet; it is assumed to
# be provided by the surrounding code (presumably a list of strings) -- confirm.
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")

# Inference only: disable autograd so no computation graph is built for the
# forward pass or the pooling step.
with torch.no_grad():
    # Create word embeddings
    model_output = model(**encoded_input)

    # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
    pooled_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# detach() is kept as a harmless safeguard in case the no_grad block above is
# ever removed; numpy() requires a tensor that does not track gradients.
sentence_embeddings = pooled_embeddings.detach().numpy()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment