Skip to content

Instantly share code, notes, and snippets.

@Aditya1001001
Last active April 19, 2023 15:34
Show Gist options
  • Save Aditya1001001/0dcb858001998d042e453425ca46eb15 to your computer and use it in GitHub Desktop.
Save Aditya1001001/0dcb858001998d042e453425ca46eb15 to your computer and use it in GitHub Desktop.
Comparing Text Similarity Measures & Text Embedding Methods
def tagged_document(list_of_list_of_words):
    """Yield one gensim TaggedDocument per token list, tagged with its index.

    Each document's tag is simply its position in the input iterable,
    which is the tag Doc2Vec will learn a vector for.
    """
    for doc_id, words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(words, [doc_id])
# Materialize the tagged corpus and train a Doc2Vec model on it:
# 40-dimensional vectors, words seen fewer than 2 times dropped, 30 epochs.
training_data = list(tagged_document(data))
model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)
model.build_vocab(training_data)
model.train(training_data, total_examples=model.corpus_count, epochs=model.epochs)
def cos_similarity(x, y):
    """Return the cosine similarity of two equal-length vectors, rounded to 3 decimals.

    Uses the module-level squared_sum() helper for the (3-decimal-rounded)
    L2 norms, matching the article's other snippets.

    Fix: guard against a zero denominator (one or both vectors all-zero),
    which previously raised ZeroDivisionError; similarity is reported as 0.0
    in that degenerate case.
    """
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = squared_sum(x) * squared_sum(y)
    if denominator == 0:
        return 0.0  # zero-magnitude vector: cosine undefined, report no similarity
    return round(numerator / float(denominator), 3)
# Cosine similarity between the two example sentence embeddings.
cos_similarity(embeddings[0], embeddings[1])
# OUTPUT
0.891
# Bag-of-words baseline: count-vectorize the headlines, then plot the
# pairwise cosine similarity of the raw term-count vectors.
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(headlines)  # sparse document-term count matrix
arr = X.toarray()
create_heatmap(cosine_similarity(arr))
# Load spaCy's medium English pipeline (ships static word vectors) and
# process each headline into a Doc for vector-based similarity.
nlp = spacy.load('en_core_web_md')
docs = [nlp(headline) for headline in headlines]
# Load the pre-trained ELMo model (archive 209 from the NLPL repository);
# the path assumes a Colab-style /content working directory.
from simple_elmo import ElmoModel
model = ElmoModel()
model.load("/content/209.zip")
# Probe sentence: "bank" occurs three times with different senses, used to
# show that contextual (ELMo) embeddings separate word senses.
sentence = "After stealing gold from the bank vault, the bank robber was seen fishing on the river bank."
# 20-character headline prefixes used as heatmap axis labels.
labels = [headline[:20] for headline in headlines]
def create_heatmap(similarity, cmap="YlGnBu"):
    """Render a square similarity matrix as a labelled seaborn heatmap.

    Row and column labels come from the module-level `labels` list, so the
    matrix must be ordered the same way as `labels`.
    """
    frame = pd.DataFrame(similarity)
    frame.columns = labels
    frame.index = labels
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(frame, cmap=cmap)
def distance_to_similarity(distance):
    """Map a non-negative distance to a similarity score in (0, 1] via 1 / e^d."""
    similarity = 1 / exp(distance)
    return similarity
# Convert the previously computed Euclidean distance into a similarity score.
distance_to_similarity(distance)
# OUTPUT
0.8450570465624478
# Download the pre-trained ELMo model (archive 209) from the NLPL repository.
wget http://vectors.nlpl.eu/repository/20/209.zip
# Download spaCy's medium English model (includes static word vectors).
python -m spacy download en_core_web_md
# Extract ELMo vectors for the probe sentence, averaging across layers.
# NOTE(review): shape (1, 92, 1024) and the 4-wide slices below suggest one
# 1024-d vector per *character* (the sentence is 92 chars, "bank" is 4
# chars) — confirm against the simple_elmo API.
elmo_vectors = model.get_elmo_vectors(sentence, layers="average")
print(f"Tensor shape: {elmo_vectors.shape}")
# OUTPUT
Tensor shape: (1, 92, 1024)
# Average the 4 positions spanning each occurrence of "bank" to get one
# vector per occurrence: vault-bank, robber-bank, river-bank.
vault = np.sum(elmo_vectors[0][29:33], axis = 0)/4
robber = np.sum(elmo_vectors[0][45:49], axis = 0)/4
river = np.sum(elmo_vectors[0][87:91], axis = 0)/4
from math import sqrt, pow, exp
def squared_sum(x):
    """Return the L2 norm of x (square root of the sum of squares), rounded to 3 decimals."""
    total = 0
    for value in x:
        total += value * value
    return round(sqrt(total), 3)
def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between two equal-length vectors."""
    squared_diffs = ((a - b) ** 2 for a, b in zip(x, y))
    return sqrt(sum(squared_diffs))
# Sentence-BERT: RoBERTa-large fine-tuned on the STS benchmark.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('stsb-roberta-large')
# Embed each sentence with spaCy's static vectors and measure the
# Euclidean distance between the two sentence vectors.
embeddings = [nlp(sentence).vector for sentence in sentences]
distance = euclidean_distance(embeddings[0], embeddings[1])
print(distance)
# OUTPUT
1.8646982721454675
# Load the text8 Wikipedia corpus through gensim's downloader and
# materialize it as a list of token lists for Doc2Vec training.
import gensim
import gensim.downloader as api
dataset = api.load("text8")
data = [i for i in dataset]
# Universal Sentence Encoder v4 from TensorFlow Hub.
import tensorflow as tf
import tensorflow_hub as hub
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
# Test corpus: two crypto headlines, two inflation headlines, and one
# headline that straddles both topics.
headlines = [
# Crypto
'Investors unfazed by correction as crypto funds see $154 million inflows',
'Bitcoin, Ethereum prices continue descent, but crypto funds see inflows',
# Inflation
'The surge in euro area inflation during the pandemic: transitory but with upside risks',
"Inflation: why it's temporary and raising interest rates will do more harm than good",
# Bridges both topics
'Will Cryptocurrency Protect Against Inflation?']
# Install HuggingFace transformers and the sentence-transformers wrapper.
pip install transformers sentence-transformers
def jaccard_similarity(x, y):
    """Return the Jaccard similarity between two sequences.

    Jaccard similarity = |X ∩ Y| / |X ∪ Y| over the *sets* of elements,
    so duplicates within a sequence are ignored.

    Fix: returns 0.0 when both inputs are empty (empty union) instead of
    raising ZeroDivisionError; also replaces the convoluted
    set.intersection(*[set(x), set(y)]) spelling with plain set operators.
    """
    set_x, set_y = set(x), set(y)
    union = set_x | set_y
    if not union:
        return 0.0  # both sequences empty: define similarity as 0
    return len(set_x & set_y) / float(len(union))
# Doc2Vec: infer a vector per tokenized sentence (reshaped to 2D because
# sklearn's cosine_similarity expects (n_samples, n_features)), then build
# and plot the full pairwise similarity matrix.
vectors = [model.infer_vector([word for word in sent]).reshape(1, -1) for sent in sentences]
similarity = []
for i in range(len(sentences)):
    similarity.append([
        cosine_similarity(vectors[i], vectors[j])[0][0]
        for j in range(len(sentences))
    ])
create_heatmap(similarity)
# Compare the three contextual "bank" vectors with sklearn's
# cosine_similarity.  Fix: sklearn expects 2D arrays of shape
# (n_samples, n_features); the ELMo vectors are 1D, so without the
# reshape this raises "ValueError: Expected 2D array, got 1D array
# instead".  Each call returns a 1x1 matrix, so take out the scalar.
diff_bank_1 = cosine_similarity(vault.reshape(1, -1), river.reshape(1, -1))[0][0]
diff_bank_2 = cosine_similarity(river.reshape(1, -1), robber.reshape(1, -1))[0][0]
same_bank = cosine_similarity(vault.reshape(1, -1), robber.reshape(1, -1))[0][0]
print('Vector similarity for *similar* meanings: %.2f' % same_bank)
print('Vector similarity for *different* meanings: %.2f' % diff_bank_1)
print('Vector similarity for *different* meanings: %.2f' % diff_bank_2)
# Two paraphrases used as the running example for the similarity measures.
sentences = ["The bottle is empty",
"There is nothing in the bottle"]
# Lowercase and whitespace-tokenize each sentence.
sentences = [sent.lower().split(" ") for sent in sentences]
jaccard_similarity(sentences[0], sentences[1])
# OUTPUT
0.42857142857142855
# Sentence-BERT: encode all sentences in one batch, then compute the
# pairwise cosine-similarity matrix with sentence_transformers' util helper.
embeddings = model.encode(sentences, convert_to_tensor=True)
similarity = []
for i in range(len(sentences)):
    similarity.append([
        util.pytorch_cos_sim(embeddings[i], embeddings[j]).item()
        for j in range(len(sentences))
    ])
create_heatmap(similarity)
# Universal Sentence Encoder: embed and plot pairwise cosine similarity.
# NOTE(review): `text` is not defined anywhere in this snippet — presumably
# the headlines list; confirm against the full article.
embeddings = model(text)
similarity = cosine_similarity(embeddings)
create_heatmap(similarity)
# TF-IDF baseline: same pipeline as the count-vector version, but with
# tf-idf weighting instead of raw counts.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(headlines)
arr = X.toarray()
create_heatmap(cosine_similarity(arr))
# spaCy: pairwise Doc.similarity (cosine over averaged static word
# vectors), plotted as a heatmap; finally inspect one document vector.
similarity = []
for i in range(len(docs)):
    similarity.append([docs[i].similarity(docs[j]) for j in range(len(docs))])
create_heatmap(similarity)
print(docs[0].vector)
@bitsnaps
Copy link

bitsnaps commented Apr 18, 2023

Thank you for your great article about text similarity. It has become hard to find working examples given the many breaking changes in Python's libraries; some of the functions can be simplified using third-party libraries.

1- for the euclidean distance we could use scipy:

from scipy.spatial import distance

embeddings = [nlp(sentence).vector for sentence in sentences]
distance = distance.euclidean(embeddings[0], embeddings[1])
print(distance)

2- for the cosine_similarity we could use:

import numpy as np

def cos_similarity(x,y):
  """ return cosine similarity between two lists """
  return (np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))

cos_similarity(embeddings[0], embeddings[1])

In test_elmo_word_vectors.py example, get the following error using colab:

----> 1 diff_bank_1 = cosine_similarity(vault, river)
      2 diff_bank_2 = cosine_similarity(river, robber)
      3 same_bank = cosine_similarity(vault, robber)
      4 
      5 print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)

2 frames
[/usr/local/lib/python3.9/dist-packages/sklearn/utils/validation.py](https://localhost:8080/#) in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    900             # If input is 1D raise error
    901             if array.ndim == 1:
--> 902                 raise ValueError(
    903                     "Expected 2D array, got 1D array instead:\narray={}.\n"
    904                     "Reshape your data either using array.reshape(-1, 1) if "

ValueError: Expected 2D array, got 1D array instead:
array=[0. 0. 0. ... 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment