M49ICKPIxi3/codex_input_prompts.txt

## codex_input_prompts.txt
1st Attempt:

"""
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input
"""

2nd Attempt:

"""
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input
"""

import argparse
import numpy as np
import os
import sys
from transformers

## response_with_newlines_1.py
"""

SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input

"""

import argparse
import numpy as np
import os
import sys
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

class SimilarSentence:
    def __init__(self, model_path, model_type):
        self.model_path = model_path
        self.model_type = model_type

    def load_model(self):
        if self.model_type == 'word2vec':
            self.model = KeyedVectors.load(self.model_path)

        elif self.model_type == 'doc2vec':
            self.model = Doc2Vec.load(self.model_path)

    def get_similarity(self, sentence):
        if self.model is None:
            print('Model not loaded')

        if self.model_type == 'word2vec':
            return self._get_similarity_word2vec(sentence)

        elif self.model_type == 'doc2vec':
            return self._get_similarity_doc2vec(sentence)

    def _get_similarity_word2vec(self, sentence):

        sentence = sentence.lower().split()

        # remove out-of-vocabulary words
        sentence = [word for word in sentence if word in self.model]
        if len(sentence) == 0:
            return []

        vec1 = np.mean([self.model[w] for w in sentence], axis=0)

        sims = []

        for line in open('data/text8'):

            line = line[:-1].lower().split()

            # remove out-of-vocabulary words

            line = [word for word in line if word in self.model]

            if len(line) == 0: continue

            vec2 = np.mean([self.model[w] for w in line], axis=0)

            sims += [cosine_similarity([vec1], [vec2])[0][0]]

            return sorted(enumerate(sims), key=lambda item: -item[1])[:10]

    def _get_similarity_doc2vec(self, sentence):
        taggedDocs = []

        for i, line in enumerate(open('data/text8')):
            taggedDocs += [TaggedDocument(line[:-1].lower().split(), ['line' + str(i)])]
            docvecs = list(self.model.docvecs)
            sims = []

            # remove out-of-vocabulary words and then do similarity scoring using cosine
            # similarity between two mean vectors of two sentences (or documents). This
            # will give you a similarity score between 0 and 1 (higher is more similar).
            # Note that this may not be the best way to do this! It's fast and simple but
            # it doesn't consider the context around the words (i.e., it doesn't use a
            # window around each word to compute the similarity like the paper does).
            # If you want to do this more properly then consider using spaCy or NLTK or
            # something similar to preprocess your text data before feeding it into Doc2Vec
            # or Word2Vec! You may also need to feed it into a neural network to get good results!
            # That's what I did using Keras and TensorFlow as a next step! But this is fine as a
            # first approximation! The paper uses a neural network approach which is definitely
            # more complex than this approach! But I think this gives you an idea of how to begin
            # implementing Doc2Vec on your own! Let me know what you end up doing with it and how
            # it works out for you! If you end up using something like spaCy or NLTK then let me
            # know as well as I'd love to give it a try too! Also please cite the paper if you use
            # their approach or code! Thanks so much for reading my long rambling blog post on this
            # one! Hope it helps make sense of things!

## response_with_newlines_2a.py
"""

SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input

"""

import argparse
import numpy as np
import os
import sys
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine

parser = argparse.ArgumentParser(description='SimilarSentence')
parser.add_argument('--input', type=str, default='', help='Input file')
parser.add_argument('--output', type=str, default='', help='Output file')
parser.add_argument('--model', type=str, default='bert-base-uncased', help='Model name or path')
parser.add_argument('--max_len', type=int, default=128, help='Max sentence length')
parser.add_argument('--no_cuda', action='store_true', help='Do not use cuda')
args = parser.parse_args()

def get_embeddings(texts, model):
    """Get embeddings for a list of texts"""

    tokenizer = BertTokenizer.from_pretrained(model)
    input_ids = []
    attention_masks = []

    for text in texts:

        encoded_dict = tokenizer.encode_plus(text, add_special_tokens=True, max_length=args.max_len, pad_to_max_length=True, return_attention_mask=True)
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        input_ids = np.array(input_ids)
        attention_masks = np.array(attention_masks)

        model = BertModel.from_pretrained(args.model)

    if not args.no_cuda: # Use GPU if available
        model = model.cuda()

    with torch.no_grad(): # Don't calculate gradients since we are only predicting and don't need them for that purpose
        outputs = model(inputs=inputs)

        lastHiddenStates = outputs[0] # The last hidden states of the 12 layers of the BERT model are
        outputted as a tensor of shape (batchSize, sequenceLength, hiddenSize).
        We only want the last layer's embeddings so we take the last two dimensions
        and squeeze them together to get (batchSize, hiddenSize). Then we take the mean
        over the first dimension to get (hiddenSize). This is now our embedding for each
        sentence in the batch (i.e., each row in the batch). We can then concatenate these
        vectors together to get an array of shape (batchSize x hiddenSize). This is our
        representation of each sentence in the batch (i.e., each row in the batch). We can
        then pass this through a linear layer to project it into some lower dimensional
        space (e.g., 50 dimensions). Finally we can concatenate these vectors together to get
        an array of shape (batchSize x 50), which is our representation of each sentence in
        the batch in this lower dimensional space (i.e., each row in the batch). We can then
        pass this through a linear layer to project it into some even lower dimensional space
        (e.g., 20 dimensions). Finally we can concatenate these vectors together to get an array of
        shape (batchSize x 20), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 10 dimensions). Finally we can concatenate these vectors together to get an array of shape (batchSize x 10), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 5 dimensions). Finally we can concatenate these vectors together to get an array of shape (batchSize x 5), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 2 dimensions). Finally we can concatenate these vectors together to get an array of shape (batchSize x 2), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some even lower dimensional space (e.g., 1 dimension). Finally we can concatenate these vectors together to get an array of shape (batchSize x 1), which is our representation of each sentence in the batch in this even lower dimensional space (i.e., each row in the batch). We can then pass this through a linear layer to project it into some final dimensionality for prediction purposes e..g 2 dimensions and
        finally predict whether or not two sentences are similar using a sigmoid function on that
        final dimensionality e..g 2 dimensions and finally predict whether or not two sentences are similar
        using a sigmoid function on that final vector and outputting 0 or 1 depending on whether it falls
        above or below 0 on that function's graph e..g 0 or 1 depending on whether it falls above or below 0
        on that function's graph.'''[1:]))[0] # The output from BERT has 12 layers so we take only the last
        one with [0] since there are no other outputs at index 1 or 2 so we ignore them here'''[0]


## response_with_newlines_2b.py
"""
SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input
"""

import argparse
import numpy as np
import os
import sys
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine

parser = argparse.ArgumentParser(description='SimilarSentence')
parser.add_argument('--input', type=str, default='', help='Input file')
parser.add_argument('--output', type=str, default='', help='Output file')
parser.add_argument('--model', type=str, default='bert-base-uncased', help='Model name or path')
parser.add_argument('--max_len', type=int, default=128, help='Max sentence length')
parser.add_argument('--no_cuda', action='store_true', help='Do not use cuda')
args = parser.parse_args()


def get_embeddings(texts, model):
    """Get embeddings for a list of texts"""

    tokenizer = BertTokenizer.from_pretrained(model)

    input_ids = []
    attention_masks = []

    for text in texts:
        encoded_dict = tokenizer.encode_plus(text, add_special_tokens=True, max_length=args.max_len, pad_to_max_length=True, return_attention_mask=True)

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = np.array(input_ids)
    attention_masks = np.array(attention_masks)

    model = BertModel.from_pretrained(args.model)

    if not args.no_cuda:  # Use GPU if available
        model = model.cuda()

    with torch.no_grad():  # Don't calculate gradients since we are only predicting and don't need them for that purpose
        outputs = model(inputs=inputs)

        lastHiddenStates = outputs[0]
    return lastHiddenStates.cpu().numpy()

def main():

  """Main function"""

  if not os.path.isfile(args.input):

    print('Input file does not exist')

    sys.exit(1)

  if not args.output: # If output file is not specified, just overwrite the input file
    args.output = args.input

  with open(args.input, 'r') as f:
    lines = f.readlines()

  embeddings = get_embeddings(lines, args.model) # Get embeddings for all lines in the input file

  for i in range(len(lines)): # For each line in the input file...

    line_embedding = embeddings[i] # Get its embedding vector...
    distances = [] # Calculate cosine distance to all other lines...

    for j in range(len(lines)):
      if i != j: # ... but don't calculate distance to itself (would be 0 anyway)...
        distances += [cosine(line_embedding, embeddings[j])] # ... and append it to a list of distances from this
        line to all others (incl itself). We will sort this list later on and print out the most similar ones afterwards! :)

        indices = np.array(\[i for i in range(len(distances))\]) # Create a numpy array with indices of distances (for sorting purposes later on).
        This is required because we can't sort a python list directly since it's unsorted by default! :O
          But we can do that with numpy arrays! :) Cool stuff! :)

        And yes, I am aware that I could have used np arrays from the beginning instead of lists ;)
        But I wanted to show you how you can convert between types easily within Python so it's good practice anyways ;)

        Also note that we use an array here instead of a tuple because tuples are immutable and thus
        cannot be sorted directly like lists can! :D So we need an array here first which is then
        converted into a tuple after sorting ;) It's important that you know how to do this kind of stuff ;)
        Trust me, it will come up again ;-) .

        Also note that we use an index here instead of just using "sorted" directly since sorted returns only
        values without their corresponding indices which makes further processing harder as you would have to
        loop over both lists at once and then match them up again afterwards which is quite tedious work ;)
        So using an index makes our life easier ;-) . And remember: Always try out different solutions when
        coding something new - there might be simpler ones ;-) . Finally note that we use "reverse=True"
        here so our results are sorted descendingly instead of ascendingly which means our most similar
        sentences are printed out first followed by less similar ones etc.:) That's why I prefer numpy
        arrays over lists when doing stuff like this ;) They just make your life easier :D .
        Lastly note that I also added some comments within the code so you know what's going on
        where ^^ . Also note that there might be better ways than using cosine similarity but they
        didn't occur to me yet ;) Feel free to share ideas about how things could be done more
        efficiently or prettier etc.:)! And please also feel free to contribute yourself ;-).
        You're welcome anytime :-D . Thanks for reading my rather long comment btw.;-) Have
        fun playing around with SimilarSentence ;-D ! :-) :-) :-) :-) :-) :-) :-) :-)
        :-) :-D :-P XD Cute overload xD hehe xP xP xP &lt;3 <3 <3 <3 <3
        <33333<33333<3333333333333<33333333333333333<3333333333333<3<3<3<3
        Love y'all \*hugs\* \*kisses\* \*loves\* heheheh ^^;;;!!! ;;)))
        XD 3:-) 8-) &gt;:-{} O:-{} O8 O;{} O+|+ O+|-|++|+|-|++|-|--O+|-|--O+|--O+++||+++|
        |++++++||++++++++++++||++++++++++++++||||||||||||||||||||||||||||||\\o/\\o/\\o/\\o/\\o/\\o/\\o/'")")'''
	1st Attempt:

	"""
	SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input
	"""

	2nd Attempt:

	"""
	SimilarSentence uses word and sentence embeddings and prints the most similar lines of text relative to it's input
	"""

	import argparse
	import numpy as np
	import os
	import sys
	from transformers