Forked from MathiasGruber/sts_sentence_embedding.py
Created
February 18, 2022 13:51
-
-
Save napoler/e64e4885067331be84d084fb769148a4 to your computer and use it in GitHub Desktop.
Embedding questions using a sentence-transformers model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch
from transformers import AutoTokenizer, AutoModel
def mean_pooling(model_output, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings into one vector per sentence, ignoring padding.

    See:
    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1

    Args:
        model_output: Indexable model result whose element ``[0]`` holds the
            per-token embeddings (presumably shaped
            ``(batch, seq_len, hidden)`` -- confirm against the model used).
        attention_mask: Mask with one entry per token; non-zero marks a real
            token, zero marks padding. It must have one dimension fewer than
            the token embeddings, since it is expanded along a new last axis.

    Returns:
        The token embeddings summed over dimension 1 and divided by the
        number of unmasked tokens.
    """
    token_embeddings = model_output[0]

    # Broadcast the mask across the embedding axis so padded positions
    # contribute nothing to the sum.
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)

    # Clamp the token count so a fully masked row divides by a tiny positive
    # number (yielding zeros) instead of by zero.
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask
# Fetch the model & tokenizer from the transformers library
model_name = 'sentence-transformers/stsb-roberta-large'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): `sentences` is not defined anywhere in this snippet; it must be
# provided before this point (presumably a list of strings -- confirm).
# Tokenize input: pad to the longest sentence, truncate at 512 tokens, and
# return PyTorch tensors.
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")

# Create word embeddings. This is inference only, so disable gradient tracking
# to avoid building an autograd graph for the forward pass.
with torch.no_grad():
    model_output = model(**encoded_input)

# Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence.
# No .detach() is needed because the forward pass ran under no_grad().
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).numpy()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment