
Semantic Textual Similarity

Code for the accompanying Medium post: Link

# Create the necessary directories
mkdir -p semantic_similarity/notebooks semantic_similarity/data/nlp
# Create and activate a conda environment
conda create -n semantic_similarity python=3.8
conda activate semantic_similarity
# Pip install the necessary libraries
pip install -U jupyterlab pandas datasets matplotlib plotly scikit-learn tqdm ipywidgets
pip install -U numpy spacy textdistance fasttext gensim
pip install -U tensorflow tensorflow_hub sentence-transformers openai
conda install pyemd
# Download the Spacy Model
python -m spacy download en_core_web_sm
# Imports
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
# Load the English STSB dataset
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train = pd.DataFrame(stsb_dataset['train'])
stsb_test = pd.DataFrame(stsb_dataset['test'])
# Check loaded data
print(stsb_train.shape, stsb_test.shape)
stsb_test.head()
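For orientation, each row pairs two sentences with a human-rated similarity. The column names come from the stsb_multi_mt dataset; the row counts below are from memory of the STS Benchmark splits, so confirm them against the print above.
print(stsb_test.columns.tolist())  # ['sentence1', 'sentence2', 'similarity_score']
# similarity_score is the gold rating on a 0-5 scale
# Expected shapes are roughly (5749, 3) for train and (1379, 3) for test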
from sklearn.metrics.pairwise import cosine_similarity
import spacy
nlp = spacy.load("en_core_web_sm")
def text_processing(sentence):
    """
    Lemmatize, lowercase, remove numbers and stop words
    Args:
        sentence: The sentence we want to process.
    Returns:
        A list of processed words
    """
    sentence = [token.lemma_.lower()
                for token in nlp(sentence)
                if token.is_alpha and not token.is_stop]
    return sentence
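A quick sanity check of the preprocessing. The exact lemmas depend on the spaCy model version, so treat the output comment as illustrative.
text_processing('A man is playing a guitar.')
# -> ['man', 'play', 'guitar'] (stop words and punctuation dropped, lemmas lowercased)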
def cos_sim(sentence1_emb, sentence2_emb):
    """
    Cosine similarity between two columns of sentence embeddings
    Args:
        sentence1_emb: sentence1 embedding column
        sentence2_emb: sentence2 embedding column
    Returns:
        The row-wise cosine similarity between the two columns.
        For instance, if sentence1_emb=[a, b, c] and sentence2_emb=[x, y, z],
        the result is [cosine_similarity(a, x), cosine_similarity(b, y), cosine_similarity(c, z)].
    """
    cos_sim = cosine_similarity(sentence1_emb, sentence2_emb)
    return np.diag(cos_sim)
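A minimal sketch of the row-wise behaviour with toy 2D vectors (not part of the original notebook):
toy_a = np.array([[1.0, 0.0], [0.0, 1.0]])
toy_b = np.array([[1.0, 0.0], [1.0, 0.0]])
cos_sim(toy_a, toy_b)  # -> array([1., 0.]): row 0 vs row 0, row 1 vs row 1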
import textdistance
def jaccard_sim(row):
    # Text Processing
    sentence1 = text_processing(row['sentence1'])
    sentence2 = text_processing(row['sentence2'])
    # Jaccard similarity
    return textdistance.jaccard.normalized_similarity(sentence1, sentence2)
# Jaccard Similarity
stsb_test['Jaccard_score'] = stsb_test.progress_apply(jaccard_sim, axis=1)
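Because the Jaccard score compares the two processed token lists as sets, a paraphrase pair that reduces to the same lemmas scores 1.0. A hypothetical example (exact lemmas depend on the spaCy model):
jaccard_sim({'sentence1': 'A man plays a guitar.', 'sentence2': 'The man is playing guitar.'})
# -> 1.0, since both sentences reduce to ['man', 'play', 'guitar']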
from sklearn.feature_extraction.text import TfidfVectorizer
model = TfidfVectorizer(lowercase=True, stop_words='english')
# Train the model
X_train = pd.concat([stsb_train['sentence1'], stsb_train['sentence2']]).unique()
model.fit(X_train)
# Generate Embeddings on Test
sentence1_emb = model.transform(stsb_test['sentence1'])
sentence2_emb = model.transform(stsb_test['sentence2'])
# Cosine Similarity
stsb_test['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
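The TF-IDF vectorizer is fit only on the unique training sentences, so the test embeddings are sparse vectors over that learned vocabulary. A quick shape check (vocabulary size varies with the data):
print(sentence1_emb.shape)  # (number of test rows, vocabulary size)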
import gensim.downloader as api
# Load the pre-trained model
model = api.load('fasttext-wiki-news-subwords-300')
def word_movers_distance(row):
    # Text Processing
    sentence1 = text_processing(row['sentence1'])
    sentence2 = text_processing(row['sentence2'])
    # Negative Word Movers Distance
    return -model.wmdistance(sentence1, sentence2)
# Negative Word Movers Distance
stsb_test['NegWMD_score'] = stsb_test.progress_apply(word_movers_distance, axis=1)
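Word Mover's Distance is a distance (lower means more similar), so it is negated to keep every score column on a "higher is better" footing. As a sanity check, an identical pair (hypothetical example) gets the maximum possible score of 0:
word_movers_distance({'sentence1': 'A man plays a guitar.', 'sentence2': 'A man plays a guitar.'})
# -> -0.0, since the WMD between identical token lists is 0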
import tensorflow as tf
import tensorflow_hub as hub
# Load the pre-trained model
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    # Control GPU memory usage
    tf.config.experimental.set_memory_growth(gpu, True)
module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
model = hub.load(module_url)
# Generate Embeddings
sentence1_emb = model(stsb_test['sentence1']).numpy()
sentence2_emb = model(stsb_test['sentence2']).numpy()
# Cosine Similarity
stsb_test['USE_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
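The Universal Sentence Encoder maps each sentence to a fixed 512-dimensional vector, so the two embedding matrices line up row for row with the test set:
print(sentence1_emb.shape)  # (number of test rows, 512)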
from sentence_transformers import CrossEncoder
# Load the pre-trained model
model = CrossEncoder('cross-encoder/stsb-roberta-base')
sentence_pairs = []
for sentence1, sentence2 in zip(stsb_test['sentence1'], stsb_test['sentence2']):
    sentence_pairs.append([sentence1, sentence2])
stsb_test['SBERT CrossEncoder_score'] = model.predict(sentence_pairs, show_progress_bar=True)
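Unlike the embedding models, the cross-encoder reads both sentences jointly and outputs a similarity score directly, so there is no separate cos_sim step. A minimal sketch with a hypothetical pair:
model.predict([['A man is playing a guitar.', 'A man plays a guitar.']])
# -> array with one score per pair, higher meaning more similar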
from sentence_transformers import SentenceTransformer
# Load the pre-trained model
model = SentenceTransformer('stsb-mpnet-base-v2')
# Generate Embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)
# Cosine Similarity
stsb_test['SBERT BiEncoder_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
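The bi-encoder embeds each sentence independently (768 dimensions for this MPNet-base model), which is what makes the column-wise cosine similarity possible:
print(sentence1_emb.shape)  # (number of test rows, 768)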
########## Supervised ##########
# Load the pre-trained model
model = SentenceTransformer('princeton-nlp/sup-simcse-roberta-large')
# Generate Embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)
# Cosine Similarity
stsb_test['SimCSE Supervised_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
########## Unsupervised ##########
# Load the pre-trained model
model = SentenceTransformer('princeton-nlp/unsup-simcse-roberta-large')
# Generate Embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)
# Cosine Similarity
stsb_test['SimCSE Unsupervised_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
import openai
import os
import pickle
client = openai.OpenAI(api_key='update_your_openai_API_key_here')
models = ["ada-002", "3-small", "3-large"]
for model in models:
    if os.path.exists(f"{model}.pkl"):
        print(f"Loading OpenAI {model} Embeddings")
        with open(f"{model}.pkl", "rb") as f:
            openai_emb = pickle.load(f)
    else:
        print(f"Querying OpenAI {model} Embeddings")
        openai_emb = {}
        unique_sentences = list(
            set(
                stsb_test["sentence1"].values.tolist()
                + stsb_test["sentence2"].values.tolist()
            )
        )
        for sentence in tqdm(unique_sentences):
            if sentence not in openai_emb.keys():
                response = client.embeddings.create(
                    input=sentence, model=f"text-embedding-{model}"
                )
                openai_emb[sentence] = response.data[0].embedding
        with open(f"{model}.pkl", "wb") as f:
            pickle.dump(openai_emb, f)

    # Generate Embeddings
    sentence1_emb = [openai_emb[sentence] for sentence in stsb_test["sentence1"]]
    sentence2_emb = [openai_emb[sentence] for sentence in stsb_test["sentence2"]]

    # Cosine Similarity
    stsb_test[f"OpenAI {model}_cosine_score"] = cos_sim(sentence1_emb, sentence2_emb)
score_cols = [col for col in stsb_test.columns if '_score' in col]
# Spearman Rank Correlation
spearman_rank_corr = stsb_test[score_cols].corr(method='spearman').iloc[1:, 0:1]*100
spearman_rank_corr.head(10)
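Spearman rank correlation is used because it compares only rank orderings, so methods with very different score scales (Jaccard in [0, 1], NegWMD below 0, cross-encoder logits) remain directly comparable. A toy illustration, not from the original post:
pd.Series([0.1, 0.5, 0.9]).corr(pd.Series([10, 50, 90]), method='spearman')  # -> 1.0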
from plotly.subplots import make_subplots
import plotly.graph_objects as go
nrows = 4
ncols = 3
plot_array = np.arange(0, nrows*ncols).reshape(nrows, ncols)
subplot_titles = [f'{row.Index.split("_")[0]}: {row.similarity_score:.2f}' for row in spearman_rank_corr.itertuples()]
fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=subplot_titles)
for index, score in enumerate(spearman_rank_corr.index):
    row, col = np.argwhere(plot_array == index)[0]
    fig.add_trace(
        go.Scatter(
            x=stsb_test[score_cols[0]],
            y=stsb_test[score],
            mode='markers',
        ),
        row=row+1, col=col+1
    )
fig.update_layout(height=700, width=1000, title_text='Spearman Rank Correlation (ρ × 100)', showlegend=False)
fig.show()