A small script to benchmark different models for text similarity using SentenceTransformer
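Note: running the script assumes that scikit-learn, spaCy, and sentence-transformers are installed, and that the large English spaCy pipeline has been downloaded; a minimal setup sketch for a notebook environment (package versions are left unpinned and are an assumption about the reader's setup):

!pip install scikit-learn spacy sentence-transformers
!python -m spacy download en_core_web_lg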
from sklearn.metrics.pairwise import cosine_similarity

try:
    import spacy
except ImportError:
    !pip install spacy
    import spacy

try:
    from sentence_transformers import SentenceTransformer, util
except ImportError:
    !pip install sentence-transformers
    from sentence_transformers import SentenceTransformer, util

# SpaCy: distance similarity using text embeddings
# Load the pre-trained spaCy pipeline (assumes en_core_web_lg has been downloaded)
nlp = spacy.load("en_core_web_lg")
# best score: 0.86 (using the distilbert-base-nli-mean-tokens model)
sentence1 = 'The bottle is empty.'
sentence2 = 'There is nothing in the bottle.'

# best score: 0.95 (using the paraphrase-distilroberta-base-v1 model)
#sentence1 = 'Assistant of the Director'
#sentence2 = "Director's Assistant"

# Generate embeddings for the sentences
embedding1 = nlp(sentence1).vector
embedding2 = nlp(sentence2).vector

# Calculate the cosine similarity between the two embeddings
similarity = cosine_similarity([embedding1], [embedding2])
print("Similarity between the two sentences:", similarity[0][0])
# SentenceTransformers: semantic similarity
# Candidate models to benchmark; the scores in the comments were measured for the
# first sentence pair above ('The bottle is empty.' / 'There is nothing in the bottle.')
models = [
    # pre-trained models for English
    'bert-base-nli-mean-tokens',        # ~405Mb (BERT based, mean of token embeddings, good for general purposes), score: 0.7880
    'bert-base-nli-stsb-mean-tokens',   # ~405Mb (similar to bert-base-nli-mean-tokens, fine-tuned on the STS-B benchmark, good for similarity tasks), score: 0.7683
    'roberta-base-nli-mean-tokens',     # ~448Mb (RoBERTa based, faster, generally better than the BERT-based models), score: 0.8046
    'distilbert-base-nli-mean-tokens',  # ~245Mb (DistilBERT based, smaller and faster version of BERT), score: 0.8646
    'paraphrase-distilroberta-base-v1', # ~263Mb (DistilRoBERTa based, trained on paraphrase data, designed for sentence similarity), score: 0.8170
    'stsb-roberta-base-v2',             # ~448Mb (RoBERTa based, fine-tuned on the STS-B benchmark, generally faster and more accurate than the BERT-based models), score: 0.6706
    # models supporting other languages:
    'xlm-r-100langs-bert-base-nli-mean-tokens',      # ~1.03Gb (XLM-RoBERTa based, supports 100 languages), score: 0.8356
    'xlm-r-100langs-bert-base-nli-stsb-mean-tokens', # ~1.03Gb (same as above, fine-tuned on the STS-B benchmark), score: 0.8059
    'all-MiniLM-L6-v2',                 # ~250Mb (MiniLM architecture, compact BERT-style model), score: 0.6772
]
def get_semantic_similarity(text1, text2, model_name):
    # Load a pre-trained model
    model = SentenceTransformer(model_name)
    # Generate sentence embeddings for both sentences
    embeddings = model.encode([text1, text2], show_progress_bar=False)
    # Calculate the cosine similarity between the two embeddings
    return util.cos_sim(embeddings[0], embeddings[1])

for model in models:
    similarity = get_semantic_similarity(sentence1, sentence2, model)
    print(f"Semantic similarity between the two sentences using {model}: {similarity}")