Adding imports significantly guides the responses. This uses Codex (GPT-3).
1st Attempt:
2nd Attempt:
import argparse
import numpy as np
import os
import sys
from transformers
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to its input.
import argparse
import numpy as np
import os
import sys
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
class SimilarSentence:
    """Rank the lines of a text corpus by similarity to a query sentence.

    Two model types are recognised: ``'word2vec'`` (sentence vectors are the
    mean of the in-vocabulary word vectors) and ``'doc2vec'``.

    Args:
        model_path: Path handed to the gensim ``load`` call for the model.
        model_type: Either ``'word2vec'`` or ``'doc2vec'``.
        corpus_path: Text file whose lines are compared against the query.
            Defaults to ``'data/text8'``.
    """

    def __init__(self, model_path, model_type, corpus_path='data/text8'):
        self.model_path = model_path
        self.model_type = model_type
        self.corpus_path = corpus_path
        # Set explicitly so get_similarity() can detect a missing model
        # instead of failing with AttributeError.
        self.model = None

    def load_model(self):
        """Load the model named by ``model_path`` according to ``model_type``.

        Any other ``model_type`` leaves ``self.model`` untouched.
        """
        if self.model_type == 'word2vec':
            self.model = KeyedVectors.load(self.model_path)
        elif self.model_type == 'doc2vec':
            self.model = Doc2Vec.load(self.model_path)

    def get_similarity(self, sentence):
        """Dispatch to the similarity routine matching ``model_type``.

        Returns:
            The result of the type-specific routine, an empty list when no
            model has been loaded, or ``None`` for an unrecognised
            ``model_type``.
        """
        if self.model is None:
            print('Model not loaded')
            # Stop here: the type-specific routines need a loaded model.
            return []
        if self.model_type == 'word2vec':
            return self._get_similarity_word2vec(sentence)
        elif self.model_type == 'doc2vec':
            return self._get_similarity_doc2vec(sentence)

    def _get_similarity_word2vec(self, sentence):
        """Return the ten corpus lines most similar to ``sentence``.

        Both the query and each corpus line are lower-cased, split on
        whitespace, stripped of out-of-vocabulary words and averaged into a
        single vector; lines are then ranked by cosine similarity.

        Returns:
            Up to ten ``(line_index, similarity)`` tuples, best match first,
            where ``line_index`` is the zero-based line number in the corpus
            file. Empty when the query has no in-vocabulary words.
        """
        # remove out-of-vocabulary words
        query_tokens = [w for w in sentence.lower().split() if w in self.model]
        if not query_tokens:
            return []
        query_vec = np.mean([self.model[w] for w in query_tokens], axis=0)

        scored = []
        with open(self.corpus_path) as corpus:
            for line_no, line in enumerate(corpus):
                # split() already discards the trailing newline, so no
                # slicing is needed (slicing would eat a real character on
                # a final line that lacks a newline).
                tokens = [w for w in line.lower().split() if w in self.model]
                if not tokens:
                    continue
                line_vec = np.mean([self.model[w] for w in tokens], axis=0)
                similarity = cosine_similarity([query_vec], [line_vec])[0][0]
                # Keep the real line number: skipped lines would otherwise
                # shift every later index.
                scored.append((line_no, similarity))
        return sorted(scored, key=lambda item: -item[1])[:10]

    def _get_similarity_doc2vec(self, sentence):
        """Prepare corpus documents and document vectors for doc2vec ranking.

        NOTE(review): the implementation is cut off in the original notes
        after ``sims`` is initialised; no ranking is performed and the
        method currently returns ``None``.
        """
        with open(self.corpus_path) as corpus:
            tagged_docs = [
                TaggedDocument(line.lower().split(), ['line' + str(i)])
                for i, line in enumerate(corpus)
            ]
        docvecs = list(self.model.docvecs)
        sims = []
        # TODO: compute similarities between `sentence` and `docvecs`, fill
        # `sims` and return the ranked result.
import argparse
import numpy as np
import os
import sys
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
# Command-line interface. Arguments are parsed once at module level and the
# resulting `args` namespace is read as a global by get_embeddings() and main().
parser = argparse.ArgumentParser(description='SimilarSentence')
# File whose lines are embedded and compared with each other.
parser.add_argument('--input', type=str, default='', help='Input file')
# main() falls back to the input path when this is left empty.
parser.add_argument('--output', type=str, default='', help='Output file')
# Passed to the `from_pretrained` loaders.
parser.add_argument('--model', type=str, default='bert-base-uncased', help='Model name or path')
# Used as the tokenizer's `max_length` when encoding each line.
parser.add_argument('--max_len', type=int, default=128, help='Max sentence length')
# When set, the model is not moved to the GPU.
parser.add_argument('--no_cuda', action='store_true', help='Do not use cuda')
args = parser.parse_args()
def get_embeddings(texts, model):
    """Get embeddings for a list of texts.

    Every text is tokenized and padded/truncated to ``args.max_len`` tokens,
    then the whole batch is run through BERT in a single forward pass.

    Args:
        texts: Sequence of strings to embed.
        model: Model name or path, used for both the tokenizer and the
            BERT weights.

    Returns:
        NumPy array holding the first element of the model output (the last
        hidden states), with one entry per input text.
    """
    # Imported locally because the file's import block does not provide torch.
    import torch

    tokenizer = BertTokenizer.from_pretrained(model)
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=args.max_len,
            padding='max_length',
            # Without truncation, over-long texts would give ragged rows.
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Use the GPU only when it is both allowed and actually available.
    use_cuda = (not args.no_cuda) and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
    attention_masks = torch.tensor(attention_masks, dtype=torch.long, device=device)

    # Load the weights from the same name/path as the tokenizer so the two
    # always match.
    bert = BertModel.from_pretrained(model).to(device)
    with torch.no_grad():  # Inference only; gradients are not needed
        outputs = bert(input_ids=input_ids, attention_mask=attention_masks)
    last_hidden_states = outputs[0]
    return last_hidden_states.cpu().numpy()
def main():
    """Embed every line of the input file and compare the lines pairwise.

    Reads ``args.input``, obtains one embedding per line from
    ``get_embeddings`` and, for each line, computes the cosine distance to
    every other line.

    NOTE(review): the original notes end before the step that sorts the
    distances and prints the most similar lines, so that step is not
    implemented here.
    """
    if not os.path.isfile(args.input):
        print('Input file does not exist')
        # Nothing to read; stop instead of failing on open() below.
        return
    if not args.output:  # If output file is not specified, just overwrite the input file
        args.output = args.input
    with open(args.input, 'r') as f:
        lines = f.readlines()
    embeddings = get_embeddings(lines, args.model)  # Get embeddings for all lines in the input file
    # scipy's cosine() only accepts 1-D vectors; flatten each line's
    # embedding once up front (a no-op if it is already 1-D).
    vectors = [np.ravel(embedding) for embedding in embeddings]
    for i, line_vector in enumerate(vectors):  # For each line in the input file...
        # ...calculate the cosine distance to all other lines, skipping the
        # line itself (its distance would be 0 anyway). The other line's
        # index is kept alongside each distance so the list can still be
        # mapped back to lines after sorting.
        distances = [
            (j, cosine(line_vector, other_vector))
            for j, other_vector in enumerate(vectors)
            if j != i
        ]
        # TODO: sort `distances` and print out the most similar lines.
