A small script to benchmark different models for text similarity using SentenceTransformer
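Note: running the script assumes that scikit-learn, spaCy, and sentence-transformers are installed, and that the large English spaCy pipeline has been downloaded; a minimal setup sketch for a notebook environment (package versions are left unpinned and are an assumption about the reader's setup):

!pip install scikit-learn spacy sentence-transformers
!python -m spacy download en_core_web_lg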
from sklearn.metrics.pairwise import cosine_similarity

try:
    import spacy
except ImportError:
    !pip install spacy
    import spacy

try:
    from sentence_transformers import SentenceTransformer, util
except ImportError:
    !pip install sentence-transformers
    from sentence_transformers import SentenceTransformer, util

# SpaCy: distance similarity using text embeddings
# Load the pre-trained spaCy pipeline (assumes en_core_web_lg has been downloaded)
nlp = spacy.load("en_core_web_lg")
# best score: 0.86 (using the distilbert-base-nli-mean-tokens model)
sentence1 = 'The bottle is empty.'
sentence2 = 'There is nothing in the bottle.'

# best score: 0.95 (using the paraphrase-distilroberta-base-v1 model)
#sentence1 = 'Assistant of the Director'
#sentence2 = "Director's Assistant"

# Generate embeddings for the sentences
embedding1 = nlp(sentence1).vector
embedding2 = nlp(sentence2).vector

# Calculate the cosine similarity between the two embeddings
similarity = cosine_similarity([embedding1], [embedding2])
print("Similarity between the two sentences:", similarity[0][0])
# SentenceTransformers: semantic similarity
# Candidate models to benchmark; the scores in the comments were measured for the
# first sentence pair above ('The bottle is empty.' / 'There is nothing in the bottle.')
models = [
    # pre-trained models for English
    'bert-base-nli-mean-tokens',        # ~405Mb (BERT based, mean of token embeddings, good for general purposes), score: 0.7880
    'bert-base-nli-stsb-mean-tokens',   # ~405Mb (similar to bert-base-nli-mean-tokens, fine-tuned on the STS-B benchmark, good for similarity tasks), score: 0.7683
    'roberta-base-nli-mean-tokens',     # ~448Mb (RoBERTa based, faster, generally better than the BERT-based models), score: 0.8046
    'distilbert-base-nli-mean-tokens',  # ~245Mb (DistilBERT based, smaller and faster version of BERT), score: 0.8646
    'paraphrase-distilroberta-base-v1', # ~263Mb (DistilRoBERTa based, trained on paraphrase data, designed for sentence similarity), score: 0.8170
    'stsb-roberta-base-v2',             # ~448Mb (RoBERTa based, fine-tuned on the STS-B benchmark, generally faster and more accurate than the BERT-based models), score: 0.6706
    # models supporting other languages:
    'xlm-r-100langs-bert-base-nli-mean-tokens',      # ~1.03Gb (XLM-RoBERTa based, supports 100 languages), score: 0.8356
    'xlm-r-100langs-bert-base-nli-stsb-mean-tokens', # ~1.03Gb (same as above, fine-tuned on the STS-B benchmark), score: 0.8059
    'all-MiniLM-L6-v2',                 # ~250Mb (MiniLM architecture, compact BERT-style model), score: 0.6772
]
def get_semantic_similarity(text1, text2, model_name):
    # Load a pre-trained model
    model = SentenceTransformer(model_name)
    # Generate sentence embeddings for both sentences
    embeddings = model.encode([text1, text2], show_progress_bar=False)
    # Calculate the cosine similarity between the two embeddings
    return util.cos_sim(embeddings[0], embeddings[1])

for model in models:
    similarity = get_semantic_similarity(sentence1, sentence2, model)
    print(f"Semantic similarity between the two sentences using {model}: {similarity}")