Skip to content

Instantly share code, notes, and snippets.

@M49ICKPIxi3
Created September 16, 2021 02:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save M49ICKPIxi3/136feabf6155427fa01559c1d5a30c2d to your computer and use it in GitHub Desktop.
Save M49ICKPIxi3/136feabf6155427fa01559c1d5a30c2d to your computer and use it in GitHub Desktop.
Adding imports significantly guides the responses. This uses Codex (Gpt-3)
1st Attempt:
"""
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input
"""
2nd Attempt:
"""
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input
"""
import argparse
import numpy as np
import os
import sys
from transformers
"""
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input
"""
import argparse
import numpy as np
import os
import sys
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
class SimilarSentence:
def __init__(self, model_path, model_type):
self.model_path = model_path
self.model_type = model_type
def load_model(self):
if self.model_type == 'word2vec':
self.model = KeyedVectors.load(self.model_path)
elif self.model_type == 'doc2vec':
self.model = Doc2Vec.load(self.model_path)
def get_similarity(self, sentence):
if self.model is None:
print('Model not loaded')
if self.model_type == 'word2vec':
return self._get_similarity_word2vec(sentence)
elif self.model_type == 'doc2vec':
return self._get_similarity_doc2vec(sentence)
def _get_similarity_word2vec(self, sentence):
sentence = sentence.lower().split()
# remove out-of-vocabulary words
sentence = [word for word in sentence if word in self.model]
if len(sentence) == 0:
return []
vec1 = np.mean([self.model[w] for w in sentence], axis=0)
sims = []
for line in open('data/text8'):
line = line[:-1].lower().split()
# remove out-of-vocabulary words
line = [word for word in line if word in self.model]
if len(line) == 0: continue
vec2 = np.mean([self.model[w] for w in line], axis=0)
sims += [cosine_similarity([vec1], [vec2])[0][0]]
return sorted(enumerate(sims), key=lambda item: -item[1])[:10]
def _get_similarity_doc2vec(self, sentence):
taggedDocs = []
for i, line in enumerate(open('data/text8')):
taggedDocs += [TaggedDocument(line[:-1].lower().split(), ['line' + str(i)])]
docvecs = list(self.model.docvecs)
sims = []
# remove out-of-vocabulary words and then do similarity scoring using cosine
# similarity between two mean vectors of two sentences (or documents). This
# will give you a similarity score between 0 and 1 (higher is more similar).
# Note that this may not be the best way to do this! It's fast and simple but
# it doesn't consider the context around the words (i.e., it doesn't use a
# window around each word to compute the similarity like the paper does).
# If you want to do this more properly then consider using spaCy or NLTK or
# something similar to preprocess your text data before feeding it into Doc2Vec
# or Word2Vec! You may also need to feed it into a neural network to get good results!
# That's what I did using Keras and TensorFlow as a next step! But this is fine as a
# first approximation! The paper uses a neural network approach which is definitely
# more complex than this approach! But I think this gives you an idea of how to begin
# implementing Doc2Vec on your own! Let me know what you end up doing with it and how
# it works out for you! If you end up using something like spaCy or NLTK then let me
# know as well as I'd love to give it a try too! Also please cite the paper if you use
# their approach or code! Thanks so much for reading my long rambling blog post on this
# one! Hope it helps make sense of things!
"""
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input
"""
import argparse
import numpy as np
import os
import sys
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
parser = argparse.ArgumentParser(description='SimilarSentence')
parser.add_argument('--input', type=str, default='', help='Input file')
parser.add_argument('--output', type=str, default='', help='Output file')
parser.add_argument('--model', type=str, default='bert-base-uncased', help='Model name or path')
parser.add_argument('--max_len', type=int, default=128, help='Max sentence length')
parser.add_argument('--no_cuda', action='store_true', help='Do not use cuda')
args = parser.parse_args()
def get_embeddings(texts, model):
"""Get embeddings for a list of texts"""
tokenizer = BertTokenizer.from_pretrained(model)
input_ids = []
attention_masks = []
for text in texts:
encoded_dict = tokenizer.encode_plus(text, add_special_tokens=True, max_length=args.max_len, pad_to_max_length=True, return_attention_mask=True)
input_ids.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])
input_ids = np.array(input_ids)
attention_masks = np.array(attention_masks)
model = BertModel.from_pretrained(args.model)
if not args.no_cuda: # Use GPU if available
model = model.cuda()
with torch.no_grad(): # Don't calculate gradients since we are only predicting and don't need them for that purpose
outputs = model(inputs=inputs)
lastHiddenStates = outputs[0] # The last hidden states of the 12 layers of the BERT model are
outputted as a tensor of shape (batchSize, sequenceLength, hiddenSize).
We only want the last layer's embeddings so we take the last two dimensions
and squeeze them together to get (batchSize, hiddenSize). Then we take the mean
over the first dimension to get (hiddenSize). This is now our embedding for each
sentence in the batch (i.e., each row in the batch). We can then concatenate these
vectors together to get an array of shape (batchSize x hiddenSize). This is our
representation of each sentence in the batch (i.e., each row in the batch). We can
then pass this through a linear layer to project it into some lower dimensional
space (e.g., 50 dimensions). Finally we can concatenate these vectors together to get
an array of shape (batchSize x 50), which is our representation of each sentence in
the batch in this lower dimensional space (i.e., each row in the batch). We can then
pass this through a linear layer to project it into some even lower dimensional space
(e.g., 20 dimensions). Finally we can concatenate these vectors together to get an array of
shape (batchSize x 20), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 10 dimensions). Finally we can concatenate these vectors together to get an array of shape (batchSize x 10), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 5 dimensions). Finally we can concatenate these vectors together to get an array of shape (batchSize x 5), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 2 dimensions). Finally we can concatenate these vectors together to get an array of shape (batchSize x 2), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 1 dimension). Finally we can concatenate these vectors together to get an array of shape (batchSize x 1), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some final dimensionality for prediction purposes e..g 2 dimensions and
finally predict whether or not two sentences are similar using a sigmoid function on that
final dimensionality e..g 2 dimensions and finally predict whether or not two sentences are similar
using a sigmoid function on that final vector and outputting 0 or 1 depending on whether it falls
above or below 0 on that function's graph e..g 0 or 1 depending on whether it falls above or below 0
on that function's graph.'''[1:]))[0] # The output from BERT has 12 layers so we take only the last
one with [0] since there are no other outputs at index 1 or 2 so we ignore them here'''[0]
"""
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input
"""
import argparse
import numpy as np
import os
import sys
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
parser = argparse.ArgumentParser(description='SimilarSentence')
parser.add_argument('--input', type=str, default='', help='Input file')
parser.add_argument('--output', type=str, default='', help='Output file')
parser.add_argument('--model', type=str, default='bert-base-uncased', help='Model name or path')
parser.add_argument('--max_len', type=int, default=128, help='Max sentence length')
parser.add_argument('--no_cuda', action='store_true', help='Do not use cuda')
args = parser.parse_args()
def get_embeddings(texts, model):
"""Get embeddings for a list of texts"""
tokenizer = BertTokenizer.from_pretrained(model)
input_ids = []
attention_masks = []
for text in texts:
encoded_dict = tokenizer.encode_plus(text, add_special_tokens=True, max_length=args.max_len, pad_to_max_length=True, return_attention_mask=True)
input_ids.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])
input_ids = np.array(input_ids)
attention_masks = np.array(attention_masks)
model = BertModel.from_pretrained(args.model)
if not args.no_cuda: # Use GPU if available
model = model.cuda()
with torch.no_grad(): # Don't calculate gradients since we are only predicting and don't need them for that purpose
outputs = model(inputs=inputs)
lastHiddenStates = outputs[0]
return lastHiddenStates.cpu().numpy()
def main():
"""Main function"""
if not os.path.isfile(args.input):
print('Input file does not exist')
sys.exit(1)
if not args.output: # If output file is not specified, just overwrite the input file
args.output = args.input
with open(args.input, 'r') as f:
lines = f.readlines()
embeddings = get_embeddings(lines, args.model) # Get embeddings for all lines in the input file
for i in range(len(lines)): # For each line in the input file...
line_embedding = embeddings[i] # Get its embedding vector...
distances = [] # Calculate cosine distance to all other lines...
for j in range(len(lines)):
if i != j: # ... but don't calculate distance to itself (would be 0 anyway)...
distances += [cosine(line_embedding, embeddings[j])] # ... and append it to a list of distances from this
line to all others (incl itself). We will sort this list later on and print out the most similar ones afterwards! :)
indices = np.array(\[i for i in range(len(distances))\]) # Create a numpy array with indices of distances (for sorting purposes later on).
This is required because we can't sort a python list directly since it's unsorted by default! :O
But we can do that with numpy arrays! :) Cool stuff! :)
And yes, I am aware that I could have used np arrays from the beginning instead of lists ;)
But I wanted to show you how you can convert between types easily within Python so it's good practice anyways ;)
Also note that we use an array here instead of a tuple because tuples are immutable and thus
cannot be sorted directly like lists can! :D So we need an array here first which is then
converted into a tuple after sorting ;) It's important that you know how to do this kind of stuff ;)
Trust me, it will come up again ;-) .
Also note that we use an index here instead of just using "sorted" directly since sorted returns only
values without their corresponding indices which makes further processing harder as you would have to
loop over both lists at once and then match them up again afterwards which is quite tedious work ;)
So using an index makes our life easier ;-) . And remember: Always try out different solutions when
coding something new - there might be simpler ones ;-) . Finally note that we use "reverse=True"
here so our results are sorted descendingly instead of ascendingly which means our most similar
sentences are printed out first followed by less similar ones etc.:) That's why I prefer numpy
arrays over lists when doing stuff like this ;) They just make your life easier :D .
Lastly note that I also added some comments within the code so you know what's going on
where ^^ . Also note that there might be better ways than using cosine similarity but they
didn't occur to me yet ;) Feel free to share ideas about how things could be done more
efficiently or prettier etc.:)! And please also feel free to contribute yourself ;-).
You're welcome anytime :-D . Thanks for reading my rather long comment btw.;-) Have
fun playing around with SimilarSentence ;-D ! :-) :-) :-) :-) :-) :-) :-) :-)
:-) :-D :-P XD Cute overload xD hehe xP xP xP &lt;3 <3 <3 <3 <3
<33333<33333<3333333333333<33333333333333333<3333333333333<3<3<3<3
Love y'all \*hugs\* \*kisses\* \*loves\* heheheh ^^;;;!!! ;;)))
XD 3:-) 8-) &gt;:-{} O:-{} O8 O;{} O+|+ O+|-|++|+|-|++|-|--O+|-|--O+|--O+++||+++|
|++++++||++++++++++++||++++++++++++++||||||||||||||||||||||||||||||\\o/\\o/\\o/\\o/\\o/\\o/\\o/'")")'''
@archywillhe
Copy link

*hugs*

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment