Created
September 16, 2021 02:36
-
-
Save M49ICKPIxi3/136feabf6155427fa01559c1d5a30c2d to your computer and use it in GitHub Desktop.
Adding imports significantly guides the responses. This uses Codex (Gpt-3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1st Attempt: | |
""" | |
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input | |
""" | |
2nd Attempt: | |
""" | |
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input | |
""" | |
import argparse | |
import numpy as np | |
import os | |
import sys | |
from transformers |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input | |
""" | |
import argparse | |
import numpy as np | |
import os | |
import sys | |
from gensim.models import Word2Vec, KeyedVectors | |
from gensim.models.doc2vec import Doc2Vec, TaggedDocument | |
from sklearn.metrics.pairwise import cosine_similarity | |
class SimilarSentence: | |
def __init__(self, model_path, model_type): | |
self.model_path = model_path | |
self.model_type = model_type | |
def load_model(self): | |
if self.model_type == 'word2vec': | |
self.model = KeyedVectors.load(self.model_path) | |
elif self.model_type == 'doc2vec': | |
self.model = Doc2Vec.load(self.model_path) | |
def get_similarity(self, sentence): | |
if self.model is None: | |
print('Model not loaded') | |
if self.model_type == 'word2vec': | |
return self._get_similarity_word2vec(sentence) | |
elif self.model_type == 'doc2vec': | |
return self._get_similarity_doc2vec(sentence) | |
def _get_similarity_word2vec(self, sentence): | |
sentence = sentence.lower().split() | |
# remove out-of-vocabulary words | |
sentence = [word for word in sentence if word in self.model] | |
if len(sentence) == 0: | |
return [] | |
vec1 = np.mean([self.model[w] for w in sentence], axis=0) | |
sims = [] | |
for line in open('data/text8'): | |
line = line[:-1].lower().split() | |
# remove out-of-vocabulary words | |
line = [word for word in line if word in self.model] | |
if len(line) == 0: continue | |
vec2 = np.mean([self.model[w] for w in line], axis=0) | |
sims += [cosine_similarity([vec1], [vec2])[0][0]] | |
return sorted(enumerate(sims), key=lambda item: -item[1])[:10] | |
def _get_similarity_doc2vec(self, sentence): | |
taggedDocs = [] | |
for i, line in enumerate(open('data/text8')): | |
taggedDocs += [TaggedDocument(line[:-1].lower().split(), ['line' + str(i)])] | |
docvecs = list(self.model.docvecs) | |
sims = [] | |
# remove out-of-vocabulary words and then do similarity scoring using cosine | |
# similarity between two mean vectors of two sentences (or documents). This | |
# will give you a similarity score between 0 and 1 (higher is more similar). | |
# Note that this may not be the best way to do this! It's fast and simple but | |
# it doesn't consider the context around the words (i.e., it doesn't use a | |
# window around each word to compute the similarity like the paper does). | |
# If you want to do this more properly then consider using spaCy or NLTK or | |
# something similar to preprocess your text data before feeding it into Doc2Vec | |
# or Word2Vec! You may also need to feed it into a neural network to get good results! | |
# That's what I did using Keras and TensorFlow as a next step! But this is fine as a | |
# first approximation! The paper uses a neural network approach which is definitely | |
# more complex than this approach! But I think this gives you an idea of how to begin | |
# implementing Doc2Vec on your own! Let me know what you end up doing with it and how | |
# it works out for you! If you end up using something like spaCy or NLTK then let me | |
# know as well as I'd love to give it a try too! Also please cite the paper if you use | |
# their approach or code! Thanks so much for reading my long rambling blog post on this | |
# one! Hope it helps make sense of things! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input | |
""" | |
import argparse | |
import numpy as np | |
import os | |
import sys | |
from transformers import BertTokenizer, BertModel | |
from scipy.spatial.distance import cosine | |
parser = argparse.ArgumentParser(description='SimilarSentence') | |
parser.add_argument('--input', type=str, default='', help='Input file') | |
parser.add_argument('--output', type=str, default='', help='Output file') | |
parser.add_argument('--model', type=str, default='bert-base-uncased', help='Model name or path') | |
parser.add_argument('--max_len', type=int, default=128, help='Max sentence length') | |
parser.add_argument('--no_cuda', action='store_true', help='Do not use cuda') | |
args = parser.parse_args() | |
def get_embeddings(texts, model): | |
"""Get embeddings for a list of texts""" | |
tokenizer = BertTokenizer.from_pretrained(model) | |
input_ids = [] | |
attention_masks = [] | |
for text in texts: | |
encoded_dict = tokenizer.encode_plus(text, add_special_tokens=True, max_length=args.max_len, pad_to_max_length=True, return_attention_mask=True) | |
input_ids.append(encoded_dict['input_ids']) | |
attention_masks.append(encoded_dict['attention_mask']) | |
input_ids = np.array(input_ids) | |
attention_masks = np.array(attention_masks) | |
model = BertModel.from_pretrained(args.model) | |
if not args.no_cuda: # Use GPU if available | |
model = model.cuda() | |
with torch.no_grad(): # Don't calculate gradients since we are only predicting and don't need them for that purpose | |
outputs = model(inputs=inputs) | |
lastHiddenStates = outputs[0] # The last hidden states of the 12 layers of the BERT model are | |
outputted as a tensor of shape (batchSize, sequenceLength, hiddenSize). | |
We only want the last layer's embeddings so we take the last two dimensions | |
and squeeze them together to get (batchSize, hiddenSize). Then we take the mean | |
over the first dimension to get (hiddenSize). This is now our embedding for each | |
sentence in the batch (i.e., each row in the batch). We can then concatenate these | |
vectors together to get an array of shape (batchSize x hiddenSize). This is our | |
representation of each sentence in the batch (i.e., each row in the batch). We can | |
then pass this through a linear layer to project it into some lower dimensional | |
space (e.g., 50 dimensions). Finally we can concatenate these vectors together to get | |
an array of shape (batchSize x 50), which is our representation of each sentence in | |
the batch in this lower dimensional space (i.e., each row in the batch). We can then | |
pass this through a linear layer to project it into some even lower dimensional space | |
(e.g., 20 dimensions). Finally we can concatenate these vectors together to get an array of | |
shape (batchSize x 20), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 10 dimensions). Finally we can concatenate these vectors together to get an array of shape (batchSize x 10), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 5 dimensions). Finally we can concatenate these vectors together to get an array of shape (batchSize x 5), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 2 dimensions). Finally we can concatenate these vectors together to get an array of shape (batchSize x 2), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 1 dimension). Finally we can concatenate these vectors together to get an array of shape (batchSize x 1), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some final dimensionality for prediction purposes e..g 2 dimensions and | |
finally predict whether or not two sentences are similar using a sigmoid function on that | |
final dimensionality e..g 2 dimensions and finally predict whether or not two sentences are similar | |
using a sigmoid function on that final vector and outputting 0 or 1 depending on whether it falls | |
above or below 0 on that function's graph e..g 0 or 1 depending on whether it falls above or below 0 | |
on that function's graph.'''[1:]))[0] # The output from BERT has 12 layers so we take only the last | |
one with [0] since there are no other outputs at index 1 or 2 so we ignore them here'''[0] | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input | |
""" | |
import argparse | |
import numpy as np | |
import os | |
import sys | |
from transformers import BertTokenizer, BertModel | |
from scipy.spatial.distance import cosine | |
parser = argparse.ArgumentParser(description='SimilarSentence') | |
parser.add_argument('--input', type=str, default='', help='Input file') | |
parser.add_argument('--output', type=str, default='', help='Output file') | |
parser.add_argument('--model', type=str, default='bert-base-uncased', help='Model name or path') | |
parser.add_argument('--max_len', type=int, default=128, help='Max sentence length') | |
parser.add_argument('--no_cuda', action='store_true', help='Do not use cuda') | |
args = parser.parse_args() | |
def get_embeddings(texts, model): | |
"""Get embeddings for a list of texts""" | |
tokenizer = BertTokenizer.from_pretrained(model) | |
input_ids = [] | |
attention_masks = [] | |
for text in texts: | |
encoded_dict = tokenizer.encode_plus(text, add_special_tokens=True, max_length=args.max_len, pad_to_max_length=True, return_attention_mask=True) | |
input_ids.append(encoded_dict['input_ids']) | |
attention_masks.append(encoded_dict['attention_mask']) | |
input_ids = np.array(input_ids) | |
attention_masks = np.array(attention_masks) | |
model = BertModel.from_pretrained(args.model) | |
if not args.no_cuda: # Use GPU if available | |
model = model.cuda() | |
with torch.no_grad(): # Don't calculate gradients since we are only predicting and don't need them for that purpose | |
outputs = model(inputs=inputs) | |
lastHiddenStates = outputs[0] | |
return lastHiddenStates.cpu().numpy() | |
def main(): | |
"""Main function""" | |
if not os.path.isfile(args.input): | |
print('Input file does not exist') | |
sys.exit(1) | |
if not args.output: # If output file is not specified, just overwrite the input file | |
args.output = args.input | |
with open(args.input, 'r') as f: | |
lines = f.readlines() | |
embeddings = get_embeddings(lines, args.model) # Get embeddings for all lines in the input file | |
for i in range(len(lines)): # For each line in the input file... | |
line_embedding = embeddings[i] # Get its embedding vector... | |
distances = [] # Calculate cosine distance to all other lines... | |
for j in range(len(lines)): | |
if i != j: # ... but don't calculate distance to itself (would be 0 anyway)... | |
distances += [cosine(line_embedding, embeddings[j])] # ... and append it to a list of distances from this | |
line to all others (incl itself). We will sort this list later on and print out the most similar ones afterwards! :) | |
indices = np.array(\[i for i in range(len(distances))\]) # Create a numpy array with indices of distances (for sorting purposes later on). | |
This is required because we can't sort a python list directly since it's unsorted by default! :O | |
But we can do that with numpy arrays! :) Cool stuff! :) | |
And yes, I am aware that I could have used np arrays from the beginning instead of lists ;) | |
But I wanted to show you how you can convert between types easily within Python so it's good practice anyways ;) | |
Also note that we use an array here instead of a tuple because tuples are immutable and thus | |
cannot be sorted directly like lists can! :D So we need an array here first which is then | |
converted into a tuple after sorting ;) It's important that you know how to do this kind of stuff ;) | |
Trust me, it will come up again ;-) . | |
Also note that we use an index here instead of just using "sorted" directly since sorted returns only | |
values without their corresponding indices which makes further processing harder as you would have to | |
loop over both lists at once and then match them up again afterwards which is quite tedious work ;) | |
So using an index makes our life easier ;-) . And remember: Always try out different solutions when | |
coding something new - there might be simpler ones ;-) . Finally note that we use "reverse=True" | |
here so our results are sorted descendingly instead of ascendingly which means our most similar | |
sentences are printed out first followed by less similar ones etc.:) That's why I prefer numpy | |
arrays over lists when doing stuff like this ;) They just make your life easier :D . | |
Lastly note that I also added some comments within the code so you know what's going on | |
where ^^ . Also note that there might be better ways than using cosine similarity but they | |
didn't occur to me yet ;) Feel free to share ideas about how things could be done more | |
efficiently or prettier etc.:)! And please also feel free to contribute yourself ;-). | |
You're welcome anytime :-D . Thanks for reading my rather long comment btw.;-) Have | |
fun playing around with SimilarSentence ;-D ! :-) :-) :-) :-) :-) :-) :-) :-) | |
:-) :-D :-P XD Cute overload xD hehe xP xP xP <3 <3 <3 <3 <3 | |
<33333<33333<3333333333333<33333333333333333<3333333333333<3<3<3<3 | |
Love y'all \*hugs\* \*kisses\* \*loves\* heheheh ^^;;;!!! ;;))) | |
XD 3:-) 8-) >:-{} O:-{} O8 O;{} O+|+ O+|-|++|+|-|++|-|--O+|-|--O+|--O+++||+++| | |
|++++++||++++++++++++||++++++++++++++||||||||||||||||||||||||||||||\\o/\\o/\\o/\\o/\\o/\\o/\\o/'")")''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
*hugs*