Answer_Retrieval_TS-SS_similarity.py (created by @r-sajal, June 21, 2021)
# importing the libraries
import math
import numpy as np
import nltk
import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer
import heapq

# text from Wikipedia about Elon Musk
txt = "Elon Reeve Musk FRS (/ˈiːlɒn/ EE-lon; born June 28, 1971) is an entrepreneur and business magnate. He is the founder, CEO, and Chief Engineer at SpaceX; early stage investor,[note 1] CEO, and Product Architect of Tesla, Inc.; founder of The Boring Company; and co-founder of Neuralink and OpenAI. A centibillionaire, Musk is one of the richest people in the world. Musk was born to a Canadian mother and South African father and raised in Pretoria, South Africa. He briefly attended the University of Pretoria before moving to Canada aged 17 to attend Queen's University. He transferred to the University of Pennsylvania two years later, where he received bachelor's degrees in economics and physics. He moved to California in 1995 to attend Stanford University but decided instead to pursue a business career, co-founding the web software company Zip2 with his brother Kimbal. The startup was acquired by Compaq for $307 million in 1999. Musk co-founded online bank X.com that same year, which merged with Confinity in 2000 to form PayPal. The company was bought by eBay in 2002 for $1.5 billion. In 2002, Musk founded SpaceX, an aerospace manufacturer and space transport services company, of which he is CEO and CTO. In 2004, he joined electric vehicle manufacturer Tesla Motors, Inc. (now Tesla, Inc.) as chairman and product architect, becoming its CEO in 2008. In 2006, he helped create SolarCity, a solar energy services company that was later acquired by Tesla and became Tesla Energy. In 2015, he co-founded OpenAI, a nonprofit research company that promotes friendly artificial intelligence. In 2016, he co-founded Neuralink, a neurotechnology company focused on developing brain–computer interfaces, and founded The Boring Company, a tunnel construction company. Musk has proposed the Hyperloop, a high-speed vactrain transportation system. Musk has been the subject of criticism due to unorthodox or unscientific stances and highly publicized controversies. In 2018, he was sued for defamation by a diver who advised in the Tham Luang cave rescue; a California jury ruled in favor of Musk. In the same year, he was sued by the US Securities and Exchange Commission (SEC) for falsely tweeting that he had secured funding for a private takeover of Tesla. He settled with the SEC, temporarily stepping down from his chairmanship and accepting limitations on his Twitter usage. Musk has spread misinformation about the COVID-19 pandemic and has received criticism from experts for his other views on such matters as artificial intelligence and public transport."

# class for preprocessing and creating TF-IDF sentence embeddings
class Preprocessing:
    # constructor
    def __init__(self, txt):
        # Tokenization
        nltk.download('punkt')  # punkt is the nltk sentence tokenizer model
        # break the text into sentences
        tokens = nltk.sent_tokenize(txt)
        self.tokens = tokens
        self.tfidfvectoriser = TfidfVectorizer()

    # Data cleaning:
    # remove extra spaces, convert sentences to lower case,
    # and optionally remove stopwords
    def clean_sentence(self, sentence, stopwords=False):
        sentence = sentence.lower().strip()
        sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
        if stopwords:
            sentence = remove_stopwords(sentence)
        return sentence

    # store cleaned sentences in cleaned_sentences
    def get_cleaned_sentences(self, tokens, stopwords=False):
        cleaned_sentences = []
        for line in tokens:
            cleaned = self.clean_sentence(line, stopwords)
            cleaned_sentences.append(cleaned)
        return cleaned_sentences

    # do all the cleaning
    def cleanall(self):
        cleaned_sentences = self.get_cleaned_sentences(self.tokens, stopwords=True)
        cleaned_sentences_with_stopwords = self.get_cleaned_sentences(self.tokens, stopwords=False)
        # print(cleaned_sentences)
        # print(cleaned_sentences_with_stopwords)
        return [cleaned_sentences, cleaned_sentences_with_stopwords]

    # TF-IDF vectors for the corpus sentences
    def TFIDF(self, cleaned_sentences):
        self.tfidfvectoriser.fit(cleaned_sentences)
        tfidf_vectors = self.tfidfvectoriser.transform(cleaned_sentences)
        return tfidf_vectors

    # TF-IDF vector for the question (uses the vocabulary fitted on the corpus)
    def TFIDF_Q(self, question_to_be_cleaned):
        tfidf_vectors = self.tfidfvectoriser.transform([question_to_be_cleaned])
        return tfidf_vectors

    # main call function
    def doall(self):
        cleaned_sentences, cleaned_sentences_with_stopwords = self.cleanall()
        tfidf = self.TFIDF(cleaned_sentences)
        return [cleaned_sentences, cleaned_sentences_with_stopwords, tfidf]

class TS_SS:

    # cosine similarity
    def Cosine(self, question_vector, sentence_vector):
        dot_product = np.dot(question_vector, sentence_vector.T)
        denominator = np.linalg.norm(question_vector) * np.linalg.norm(sentence_vector)
        return dot_product / denominator

    # Euclidean distance
    def Euclidean(self, question_vector, sentence_vector):
        vec1 = question_vector.copy()
        vec2 = sentence_vector.copy()
        if len(vec1) < len(vec2):
            vec1, vec2 = vec2, vec1
        vec2 = np.resize(vec2, (vec1.shape[0], vec1.shape[1]))
        return np.linalg.norm(vec1 - vec2)

    # angle between the two vectors, plus 10 degrees so the angle is never zero
    def Theta(self, question_vector, sentence_vector):
        return np.arccos(self.Cosine(question_vector, sentence_vector)) + np.radians(10)

    # area of the triangle formed by the two vectors with the Euclidean distance as the third side
    def Triangle(self, question_vector, sentence_vector):
        theta = np.radians(self.Theta(question_vector, sentence_vector))
        return (np.linalg.norm(question_vector) * np.linalg.norm(sentence_vector) * np.sin(theta)) / 2

    # difference in magnitude of the two vectors
    def Magnitude_Difference(self, vec1, vec2):
        return abs(np.linalg.norm(vec1) - np.linalg.norm(vec2))

    # sector area similarity
    def Sector(self, question_vector, sentence_vector):
        ED = self.Euclidean(question_vector, sentence_vector)
        MD = self.Magnitude_Difference(question_vector, sentence_vector)
        theta = self.Theta(question_vector, sentence_vector)
        return math.pi * (ED + MD) ** 2 * theta / 360

    # function which is activated on call
    def __call__(self, question_vector, sentence_vector, method):
        if method == 1:
            return self.Euclidean(question_vector, sentence_vector)
        elif method == 2:
            return self.Cosine(question_vector, sentence_vector)
        else:
            return self.Triangle(question_vector, sentence_vector) * self.Sector(question_vector, sentence_vector)
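# Note (added): for method 3 the value returned above is the TS-SS score,
#   TS_SS(q, s) = Triangle(q, s) * Sector(q, s)
# It behaves like a distance: it is 0 when the two vectors coincide (both the Euclidean
# distance and the magnitude difference vanish, so the sector area is 0) and it grows as
# the vectors diverge in angle or length, so smaller means more similar.
# Illustrative sanity check (not part of the original script):
#   a = np.array([[1.0, 2.0, 3.0]])
#   b = np.array([[1.0, 2.0, 3.1]])
#   TS_SS()(a, b, method=3)   # small value  -> very similar
#   TS_SS()(a, -b, method=3)  # larger value -> dissimilar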


def RetrieveAnswer(question_embedding, tfidf_vectors, method=1):
    similarity_heap = []
    for index, embedding in enumerate(tfidf_vectors):
        find_similarity = TS_SS()
        similarity = find_similarity(question_embedding.toarray(), embedding.toarray(), method).mean()
        if method == 2:
            # cosine similarity: larger is better, so negate it before pushing onto the min-heap
            heapq.heappush(similarity_heap, (-similarity, index))
        else:
            # Euclidean distance (method 1) and TS-SS (method 3): smaller is better
            heapq.heappush(similarity_heap, (similarity, index))
    return similarity_heap
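# Note (added): similarity_heap is a Python min-heap of (score, sentence_index) pairs.
# Because the cosine score is pushed negated and the other two scores are distance-like,
# the pair at the top of the heap is always the best match, and heapq.heappop returns
# sentences from most to least relevant for every method.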

# Put your question here
user_question = "Musk was born to a Canadian mother and South African father and raised in"
# choose the similarity method: 1 = Euclidean, 2 = Cosine, 3 = TS-SS
method = 3

preprocess = Preprocessing(txt)
cleaned_sentences, cleaned_sentences_with_stopwords, tfidf_vectors = preprocess.doall()
question = preprocess.clean_sentence(user_question, stopwords=True)
question_embedding = preprocess.TFIDF_Q(question)
similarity_heap = RetrieveAnswer(question_embedding, tfidf_vectors, method)

print("Question: ", user_question)

# number of relevant sentences you want; here it will print 2
number_of_sentences_to_print = 2
while number_of_sentences_to_print > 0 and len(similarity_heap) > 0:
    score, sentence_index = heapq.heappop(similarity_heap)  # heappop keeps the heap ordering valid
    print(cleaned_sentences_with_stopwords[sentence_index])
    number_of_sentences_to_print -= 1
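# Optional (added note): the heap indices refer to positions in the tokenized sentence list,
# so the original, unprocessed sentences can be printed instead of the cleaned ones.
# A minimal sketch, assuming the heap has not been consumed yet:
#   score, sentence_index = heapq.heappop(similarity_heap)
#   print(preprocess.tokens[sentence_index])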