Code for the Medium post
[Medium] Semantic Textual Similarity
# Create the necessary directories
mkdir -p semantic_similarity/notebooks semantic_similarity/data/nlp

# Create and activate a conda environment
conda create -n semantic_similarity python=3.8
conda activate semantic_similarity

# Pip install the necessary libraries
pip install -U jupyterlab pandas datasets matplotlib plotly scikit-learn tqdm ipywidgets
pip install -U numpy spacy textdistance fasttext gensim
pip install -U tensorflow tensorflow_hub sentence-transformers openai
conda install pyemd

# Download the spaCy model
python -m spacy download en_core_web_sm
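Optionally, a quick one-liner (my own addition, not part of the original setup) to confirm the environment and the spaCy model load cleanly:

# Sanity check: should print the parsed Doc without errors
python -c "import spacy; nlp = spacy.load('en_core_web_sm'); print(nlp('environment OK'))"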
# Imports
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

# Load the English STSB dataset
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train = pd.DataFrame(stsb_dataset['train'])
stsb_test = pd.DataFrame(stsb_dataset['test'])

# Check the loaded data
print(stsb_train.shape, stsb_test.shape)
stsb_test.head()
from sklearn.metrics.pairwise import cosine_similarity
import spacy

nlp = spacy.load("en_core_web_sm")

def text_processing(sentence):
    """
    Lemmatize, lowercase, and remove numbers and stop words.

    Args:
        sentence: The sentence we want to process.

    Returns:
        A list of processed words.
    """
    sentence = [token.lemma_.lower()
                for token in nlp(sentence)
                if token.is_alpha and not token.is_stop]
    return sentence

def cos_sim(sentence1_emb, sentence2_emb):
    """
    Cosine similarity between two columns of sentence embeddings.

    Args:
        sentence1_emb: sentence1 embedding column
        sentence2_emb: sentence2 embedding column

    Returns:
        The row-wise cosine similarity between the two columns.
        For instance, if sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z],
        the result is [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)].
    """
    cos_sim = cosine_similarity(sentence1_emb, sentence2_emb)
    return np.diag(cos_sim)
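A tiny sanity check of cos_sim's row-wise behaviour, using made-up 2-D vectors (my own example, not from the post):

# Row 0 of a vs row 0 of b are identical (similarity 1.0);
# row 1 of a vs row 1 of b are orthogonal (similarity 0.0)
a = np.array([[1.0, 0.0], [0.0, 1.0]])
b = np.array([[1.0, 0.0], [1.0, 0.0]])
print(cos_sim(a, b))  # [1. 0.]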
import textdistance

def jaccard_sim(row):
    # Text processing
    sentence1 = text_processing(row['sentence1'])
    sentence2 = text_processing(row['sentence2'])

    # Jaccard similarity
    return textdistance.jaccard.normalized_similarity(sentence1, sentence2)

# Jaccard Similarity
stsb_test['Jaccard_score'] = stsb_test.progress_apply(jaccard_sim, axis=1)
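For intuition: normalized Jaccard similarity is the size of the token-set intersection divided by the size of the union, so word order is ignored entirely. A hand-worked example (sentences are my own):

tokens1 = text_processing('The dog bites the man')
tokens2 = text_processing('The man bites the dog')
# Identical token sets after processing -> similarity 1.0, despite opposite meanings
print(textdistance.jaccard.normalized_similarity(tokens1, tokens2))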
from sklearn.feature_extraction.text import TfidfVectorizer

model = TfidfVectorizer(lowercase=True, stop_words='english')

# Train the model on the unique train sentences
X_train = pd.concat([stsb_train['sentence1'], stsb_train['sentence2']]).unique()
model.fit(X_train)

# Generate embeddings on the test set
sentence1_emb = model.transform(stsb_test['sentence1'])
sentence2_emb = model.transform(stsb_test['sentence2'])

# Cosine similarity
stsb_test['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
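An optional peek at what the fitted vectorizer learned (my own check, not in the original notebook):

print(len(model.vocabulary_))  # number of terms kept after stop-word removal
print(sentence1_emb.shape)     # sparse matrix: (num test sentences, vocabulary size)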
import gensim.downloader as api

# Load the pre-trained model
model = api.load('fasttext-wiki-news-subwords-300')

def word_movers_distance(row):
    # Text processing
    sentence1 = text_processing(row['sentence1'])
    sentence2 = text_processing(row['sentence2'])

    # Negative Word Mover's Distance
    return -model.wmdistance(sentence1, sentence2)

# Negative Word Mover's Distance
stsb_test['NegWMD_score'] = stsb_test.progress_apply(word_movers_distance, axis=1)
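The distance is negated so that, like every other score column, larger means more similar. A classic illustration using the loaded model (example sentences from the WMD literature, not from the post):

s1 = text_processing('Obama speaks to the media in Illinois')
s2 = text_processing('The president greets the press in Chicago')
# No shared tokens, yet WMD finds the sentences close in embedding space
print(model.wmdistance(s1, s2))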
import tensorflow as tf
import tensorflow_hub as hub

# Control GPU memory usage
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Load the pre-trained model
module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
model = hub.load(module_url)

# Generate embeddings
sentence1_emb = model(stsb_test['sentence1']).numpy()
sentence2_emb = model(stsb_test['sentence2']).numpy()

# Cosine similarity
stsb_test['USE_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
from sentence_transformers import CrossEncoder

# Load the pre-trained model
model = CrossEncoder('cross-encoder/stsb-roberta-base')

sentence_pairs = []
for sentence1, sentence2 in zip(stsb_test['sentence1'], stsb_test['sentence2']):
    sentence_pairs.append([sentence1, sentence2])

stsb_test['SBERT CrossEncoder_score'] = model.predict(sentence_pairs, show_progress_bar=True)
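Note that a cross-encoder scores each pair jointly and never produces standalone sentence embeddings, so there is no cos_sim step here; predict returns the similarity directly. A quick single-pair check (example pair is my own):

# Should print one high similarity score
print(model.predict([['A man is eating food.', 'A man is eating a piece of bread.']]))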
from sentence_transformers import SentenceTransformer

# Load the pre-trained model
model = SentenceTransformer('stsb-mpnet-base-v2')

# Generate embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

# Cosine similarity
stsb_test['SBERT BiEncoder_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
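As an optional cross-check (my own addition), sentence-transformers ships its own cosine helper whose diagonal should match the scores above:

from sentence_transformers import util
pairwise = util.cos_sim(sentence1_emb, sentence2_emb)  # full pairwise matrix (torch.Tensor)
print(pairwise.diagonal())  # row-wise similarities, same values as cos_sim above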
########## Supervised ##########
# Load the pre-trained model
model = SentenceTransformer('princeton-nlp/sup-simcse-roberta-large')

# Generate embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

# Cosine similarity
stsb_test['SimCSE Supervised_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

########## Unsupervised ##########
# Load the pre-trained model
model = SentenceTransformer('princeton-nlp/unsup-simcse-roberta-large')

# Generate embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

# Cosine similarity
stsb_test['SimCSE Unsupervised_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
import openai
import os
import pickle

client = openai.OpenAI(api_key='update_your_openai_API_key_here')

models = ["ada-002", "3-small", "3-large"]
for model in models:
    if os.path.exists(f"{model}.pkl"):
        print(f"Loading OpenAI {model} Embeddings")
        with open(f"{model}.pkl", "rb") as f:
            openai_emb = pickle.load(f)
    else:
        print(f"Querying OpenAI {model} Embeddings")
        openai_emb = {}
        unique_sentences = list(
            set(
                stsb_test["sentence1"].values.tolist()
                + stsb_test["sentence2"].values.tolist()
            )
        )
        for sentence in tqdm(unique_sentences):
            if sentence not in openai_emb.keys():
                response = client.embeddings.create(
                    input=sentence, model=f"text-embedding-{model}"
                )
                openai_emb[sentence] = response.data[0].embedding
        with open(f"{model}.pkl", "wb") as f:
            pickle.dump(openai_emb, f)

    # Generate embeddings
    sentence1_emb = [openai_emb[sentence] for sentence in stsb_test["sentence1"]]
    sentence2_emb = [openai_emb[sentence] for sentence in stsb_test["sentence2"]]

    # Cosine similarity
    stsb_test[f"OpenAI {model}_cosine_score"] = cos_sim(sentence1_emb, sentence2_emb)
score_cols = [col for col in stsb_test.columns if '_score' in col]

# Spearman rank correlation of every method's scores against the
# ground-truth similarity_score (the first score column), scaled by 100
spearman_rank_corr = stsb_test[score_cols].corr(method='spearman').iloc[1:, 0:1] * 100
spearman_rank_corr.head(10)
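The same numbers can be cross-checked column by column with SciPy (available as a scikit-learn dependency, though not installed explicitly above):

from scipy.stats import spearmanr
# Example: Jaccard vs the ground-truth labels; should match the table above
rho, _ = spearmanr(stsb_test['similarity_score'], stsb_test['Jaccard_score'])
print(rho * 100)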
from plotly.subplots import make_subplots
import plotly.graph_objects as go

nrows = 4
ncols = 3
plot_array = np.arange(0, nrows*ncols).reshape(nrows, ncols)
subplot_titles = [f'{row.Index.split("_")[0]}: {row.similarity_score:.2f}' for row in spearman_rank_corr.itertuples()]

fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=subplot_titles)
for index, score in enumerate(spearman_rank_corr.index):
    row, col = np.argwhere(plot_array == index)[0]
    fig.add_trace(
        go.Scatter(
            x=stsb_test[score_cols[0]],
            y=stsb_test[score],
            mode='markers',
        ),
        row=row+1, col=col+1
    )

fig.update_layout(height=700, width=1000, title_text='Spearman Rank Correlation (ρ × 100)', showlegend=False)
fig.show()