Code for the Medium post
[Medium] Semantic Textual Similarity
# Create the necessary directories
mkdir -p semantic_similarity/notebooks semantic_similarity/data/nlp

# Create and activate a conda environment
conda create -n semantic_similarity python=3.8
conda activate semantic_similarity

# Pip install the necessary libraries
pip install -U jupyterlab pandas datasets matplotlib plotly scikit-learn tqdm ipywidgets
pip install -U numpy spacy textdistance fasttext gensim
pip install -U tensorflow tensorflow_hub sentence-transformers openai
conda install pyemd

# Download the spaCy model
python -m spacy download en_core_web_sm
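Optionally, a quick one-liner (my own addition, not part of the original setup) to confirm the environment and the spaCy model load cleanly:

# Sanity check: should print the parsed Doc without errors
python -c "import spacy; nlp = spacy.load('en_core_web_sm'); print(nlp('environment OK'))"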
# Imports
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()

# Load the English STSB dataset
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train = pd.DataFrame(stsb_dataset['train'])
stsb_test = pd.DataFrame(stsb_dataset['test'])

# Check the loaded data
print(stsb_train.shape, stsb_test.shape)
stsb_test.head()
from sklearn.metrics.pairwise import cosine_similarity
import spacy

nlp = spacy.load("en_core_web_sm")

def text_processing(sentence):
    """
    Lemmatize, lowercase, and remove numbers and stop words.

    Args:
        sentence: The sentence we want to process.

    Returns:
        A list of processed words.
    """
    sentence = [token.lemma_.lower()
                for token in nlp(sentence)
                if token.is_alpha and not token.is_stop]
    return sentence

def cos_sim(sentence1_emb, sentence2_emb):
    """
    Cosine similarity between two columns of sentence embeddings.

    Args:
        sentence1_emb: sentence1 embedding column
        sentence2_emb: sentence2 embedding column

    Returns:
        The row-wise cosine similarity between the two columns.
        For instance, if sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z],
        the result is [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)].
    """
    cos_sim = cosine_similarity(sentence1_emb, sentence2_emb)
    return np.diag(cos_sim)
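A tiny sanity check of cos_sim's row-wise behaviour, using made-up 2-D vectors (my own example, not from the post):

# Row 0 of a vs row 0 of b are identical (similarity 1.0);
# row 1 of a vs row 1 of b are orthogonal (similarity 0.0)
a = np.array([[1.0, 0.0], [0.0, 1.0]])
b = np.array([[1.0, 0.0], [1.0, 0.0]])
print(cos_sim(a, b))  # [1. 0.]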
import textdistance

def jaccard_sim(row):
    # Text processing
    sentence1 = text_processing(row['sentence1'])
    sentence2 = text_processing(row['sentence2'])

    # Jaccard similarity
    return textdistance.jaccard.normalized_similarity(sentence1, sentence2)

# Jaccard Similarity
stsb_test['Jaccard_score'] = stsb_test.progress_apply(jaccard_sim, axis=1)
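For intuition: normalized Jaccard similarity is the size of the token-set intersection divided by the size of the union, so word order is ignored entirely. A hand-worked example (sentences are my own):

tokens1 = text_processing('The dog bites the man')
tokens2 = text_processing('The man bites the dog')
# Identical token sets after processing -> similarity 1.0, despite opposite meanings
print(textdistance.jaccard.normalized_similarity(tokens1, tokens2))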
from sklearn.feature_extraction.text import TfidfVectorizer

model = TfidfVectorizer(lowercase=True, stop_words='english')

# Train the model on the unique train sentences
X_train = pd.concat([stsb_train['sentence1'], stsb_train['sentence2']]).unique()
model.fit(X_train)

# Generate embeddings on the test set
sentence1_emb = model.transform(stsb_test['sentence1'])
sentence2_emb = model.transform(stsb_test['sentence2'])

# Cosine similarity
stsb_test['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
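An optional peek at what the fitted vectorizer learned (my own check, not in the original notebook):

print(len(model.vocabulary_))  # number of terms kept after stop-word removal
print(sentence1_emb.shape)     # sparse matrix: (num test sentences, vocabulary size)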
import gensim.downloader as api

# Load the pre-trained model
model = api.load('fasttext-wiki-news-subwords-300')

def word_movers_distance(row):
    # Text processing
    sentence1 = text_processing(row['sentence1'])
    sentence2 = text_processing(row['sentence2'])

    # Negative Word Mover's Distance
    return -model.wmdistance(sentence1, sentence2)

# Negative Word Mover's Distance
stsb_test['NegWMD_score'] = stsb_test.progress_apply(word_movers_distance, axis=1)
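The distance is negated so that, like every other score column, larger means more similar. A classic illustration using the loaded model (example sentences from the WMD literature, not from the post):

s1 = text_processing('Obama speaks to the media in Illinois')
s2 = text_processing('The president greets the press in Chicago')
# No shared tokens, yet WMD finds the sentences close in embedding space
print(model.wmdistance(s1, s2))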
import tensorflow as tf
import tensorflow_hub as hub

# Control GPU memory usage
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Load the pre-trained model
module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
model = hub.load(module_url)

# Generate embeddings
sentence1_emb = model(stsb_test['sentence1']).numpy()
sentence2_emb = model(stsb_test['sentence2']).numpy()

# Cosine similarity
stsb_test['USE_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
from sentence_transformers import CrossEncoder

# Load the pre-trained model
model = CrossEncoder('cross-encoder/stsb-roberta-base')

sentence_pairs = []
for sentence1, sentence2 in zip(stsb_test['sentence1'], stsb_test['sentence2']):
    sentence_pairs.append([sentence1, sentence2])

stsb_test['SBERT CrossEncoder_score'] = model.predict(sentence_pairs, show_progress_bar=True)
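Note that a cross-encoder scores each pair jointly and never produces standalone sentence embeddings, so there is no cos_sim step here; predict returns the similarity directly. A quick single-pair check (example pair is my own):

# Should print one high similarity score
print(model.predict([['A man is eating food.', 'A man is eating a piece of bread.']]))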
from sentence_transformers import SentenceTransformer

# Load the pre-trained model
model = SentenceTransformer('stsb-mpnet-base-v2')

# Generate embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

# Cosine similarity
stsb_test['SBERT BiEncoder_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
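As an optional cross-check (my own addition), sentence-transformers ships its own cosine helper whose diagonal should match the scores above:

from sentence_transformers import util
pairwise = util.cos_sim(sentence1_emb, sentence2_emb)  # full pairwise matrix (torch.Tensor)
print(pairwise.diagonal())  # row-wise similarities, same values as cos_sim above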
########## Supervised ##########
# Load the pre-trained model
model = SentenceTransformer('princeton-nlp/sup-simcse-roberta-large')

# Generate embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

# Cosine similarity
stsb_test['SimCSE Supervised_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

########## Unsupervised ##########
# Load the pre-trained model
model = SentenceTransformer('princeton-nlp/unsup-simcse-roberta-large')

# Generate embeddings
sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

# Cosine similarity
stsb_test['SimCSE Unsupervised_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)
import openai
import os
import pickle

client = openai.OpenAI(api_key='update_your_openai_API_key_here')

models = ["ada-002", "3-small", "3-large"]
for model in models:
    if os.path.exists(f"{model}.pkl"):
        print(f"Loading OpenAI {model} Embeddings")
        with open(f"{model}.pkl", "rb") as f:
            openai_emb = pickle.load(f)
    else:
        print(f"Querying OpenAI {model} Embeddings")
        openai_emb = {}
        unique_sentences = list(
            set(
                stsb_test["sentence1"].values.tolist()
                + stsb_test["sentence2"].values.tolist()
            )
        )
        for sentence in tqdm(unique_sentences):
            if sentence not in openai_emb.keys():
                response = client.embeddings.create(
                    input=sentence, model=f"text-embedding-{model}"
                )
                openai_emb[sentence] = response.data[0].embedding
        with open(f"{model}.pkl", "wb") as f:
            pickle.dump(openai_emb, f)

    # Generate embeddings
    sentence1_emb = [openai_emb[sentence] for sentence in stsb_test["sentence1"]]
    sentence2_emb = [openai_emb[sentence] for sentence in stsb_test["sentence2"]]

    # Cosine similarity
    stsb_test[f"OpenAI {model}_cosine_score"] = cos_sim(sentence1_emb, sentence2_emb)
score_cols = [col for col in stsb_test.columns if '_score' in col]

# Spearman rank correlation of every method's scores against the
# ground-truth similarity_score (the first score column), scaled by 100
spearman_rank_corr = stsb_test[score_cols].corr(method='spearman').iloc[1:, 0:1] * 100
spearman_rank_corr.head(10)
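The same numbers can be cross-checked column by column with SciPy (available as a scikit-learn dependency, though not installed explicitly above):

from scipy.stats import spearmanr
# Example: Jaccard vs the ground-truth labels; should match the table above
rho, _ = spearmanr(stsb_test['similarity_score'], stsb_test['Jaccard_score'])
print(rho * 100)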
from plotly.subplots import make_subplots
import plotly.graph_objects as go

nrows = 4
ncols = 3
plot_array = np.arange(0, nrows*ncols).reshape(nrows, ncols)
subplot_titles = [f'{row.Index.split("_")[0]}: {row.similarity_score:.2f}' for row in spearman_rank_corr.itertuples()]

fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=subplot_titles)
for index, score in enumerate(spearman_rank_corr.index):
    row, col = np.argwhere(plot_array == index)[0]
    fig.add_trace(
        go.Scatter(
            x=stsb_test[score_cols[0]],
            y=stsb_test[score],
            mode='markers',
        ),
        row=row+1, col=col+1
    )

fig.update_layout(height=700, width=1000, title_text='Spearman Rank Correlation (ρ × 100)', showlegend=False)
fig.show()