@bitsnaps
Created April 20, 2023 15:45
A small script to benchmark different SentenceTransformer models for text similarity, with a spaCy embedding baseline
from sklearn.metrics.pairwise import cosine_similarity
try:
    import spacy
except ImportError:
    !pip install spacy
    import spacy
try:
    from sentence_transformers import SentenceTransformer, util
except ImportError:
    !pip install sentence-transformers
    from sentence_transformers import SentenceTransformer, util
# spaCy: Cosine Similarity using Text Embeddings
# Load the pre-trained spaCy model (requires: python -m spacy download en_core_web_lg)
nlp = spacy.load("en_core_web_lg")
# best score: 0.86 (using: distilbert-base-nli-mean-tokens model)
sentence1 = 'The bottle is empty.'
sentence2 = 'There is nothing in the bottle.'
# best score: 0.95 (using: paraphrase-distilroberta-base-v1 model)
#sentence1 = 'Assistant of the Director'
#sentence2 = "Director's Assistant"
# Generate embeddings for the sentences
embedding1 = nlp(sentence1).vector
embedding2 = nlp(sentence2).vector
# Calculate the cosine similarity between the two embeddings
similarity = cosine_similarity([embedding1], [embedding2])
print("Similarity between the two sentences: ", similarity[0][0])
# SentenceTransformers (Semantic Similarity)
models = [
    # pre-trained models for English
    'bert-base-nli-mean-tokens', # ~405MB (BERT-based, mean of token embeddings, good for general purposes), score: 0.7880
    'bert-base-nli-stsb-mean-tokens', # ~405MB (similar to bert-base-nli-mean-tokens, fine-tuned on the STS-B benchmark dataset, good for similarity tasks), score: 0.7683
    'roberta-base-nli-mean-tokens', # ~448MB (RoBERTa-based, faster, generally better than the BERT-based models), score: 0.8046
    'distilbert-base-nli-mean-tokens', # ~245MB (DistilBERT-based, a smaller, faster version of BERT), score: 0.8646
    'paraphrase-distilroberta-base-v1', # ~263MB (DistilRoBERTa-based, trained on large paraphrase data, designed for sentence similarity), score: 0.8170
    'stsb-roberta-base-v2', # ~448MB (RoBERTa-based, fine-tuned on the STS-B benchmark, generally faster and more accurate than the BERT-based models), score: 0.6706
    # models that support other languages:
    'xlm-r-100langs-bert-base-nli-mean-tokens', # ~1.03GB (XLM-RoBERTa-based, supports 100 languages), score: 0.8356
    'xlm-r-100langs-bert-base-nli-stsb-mean-tokens', # ~1.03GB (same as xlm-r-100langs-bert-base-nli-mean-tokens, fine-tuned on the STS-B benchmark), score: 0.8059
    'all-MiniLM-L6-v2', # ~250MB (MiniLM architecture, a compact BERT variant), score: 0.6772
]
def get_semantic_similarity(text1, text2, model_name):
    # Load a pre-trained model
    model = SentenceTransformer(model_name)
    # Generate sentence embeddings for both sentences
    embeddings = model.encode([text1, text2], show_progress_bar=False)
    # Calculate the cosine similarity between the two embeddings (as a plain float)
    return util.cos_sim(embeddings[0], embeddings[1]).item()
for model_name in models:
    similarity = get_semantic_similarity(sentence1, sentence2, model_name)
    print(f"Semantic similarity between the two sentences using {model_name}: {similarity:.4f}")