# skarlekar/prompt-compare.py

## prompt-compare.py
"""
This Python script provides a utility to compute the cosine similarity between two text sentences using the TF-IDF
(Term Frequency-Inverse Document Frequency) vectorization approach.

Key Components:
1. Import Statements: The script begins by importing necessary modules:
   - TfidfVectorizer from sklearn.feature_extraction.text for converting text data into a matrix of TF-IDF features.
   - cosine_similarity from sklearn.metrics.pairwise to compute the similarity between two vectors in the TF-IDF space.
   - sys for accessing command-line arguments.

2. Function get_sentences(): This function retrieves the sentences from the command line when exactly two arguments
are supplied, and otherwise prompts the user for them interactively, so the program works in both usage contexts.

3. Function calculate_similarity(): Accepts two sentences and computes their similarity. It uses the TF-IDF vectorizer
to transform the sentences into vectors and then calculates the cosine similarity between these vectors. The similarity
is also converted into a percentage format for easier interpretation.

4. Function main(): Orchestrates the flow by calling get_sentences() to retrieve the sentences, computes their similarity
using calculate_similarity(), and then prints the results.

5. Execution Check: The script checks if it's run as the main program and calls the main() function accordingly, ensuring
it doesn't run when imported as a module.

This script is useful for applications like plagiarism detection, semantic search, or text data preprocessing where
understanding the degree of similarity between texts is crucial.
"""

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sys

def get_sentences():
    """Return the two sentences to compare as a ``(first, second)`` tuple.

    If the script was launched with exactly two command-line arguments,
    those are the sentences. With any other argument count the user is
    prompted on standard input for each sentence in turn.
    """
    cli_args = sys.argv[1:]

    if len(cli_args) != 2:
        # Not exactly two arguments: fall back to interactive prompts.
        first = input("Please enter the first sentence: ")
        second = input("Please enter the second sentence: ")
        return first, second

    first, second = cli_args
    return first, second

def calculate_similarity(sentence1=None, sentence2=None):
    """Compute the TF-IDF cosine similarity between two sentences.

    Args:
        sentence1: First sentence to compare.
        sentence2: Second sentence to compare.

    Returns:
        A ``(cosine_sim, percentage_similarity)`` tuple, where
        ``percentage_similarity`` is ``100 * cosine_sim``. If either
        sentence is ``None``, an error is printed to stderr and
        ``(None, None)`` is returned instead.

    Note:
        NOTE(review): per the scikit-learn docs, ``fit_transform`` raises
        ``ValueError`` when no tokens can be extracted from either sentence
        (empty vocabulary); that exception is left to propagate.
    """
    if sentence1 is None or sentence2 is None:
        print("Error: You have to pass two sentences for calculating similarity", file=sys.stderr)
        return None, None

    # Fit a single vocabulary over both sentences so that the two row
    # vectors share the same feature columns.
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([sentence1, sentence2])

    # Two single-row inputs give a 1x1 result matrix; take its only entry.
    cosine_sim = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]

    # TF-IDF weights are non-negative, so the cosine score already lies in
    # [0, 1]. Scale it directly; the earlier 50 * (score + 1) mapping assumed
    # a [-1, 1] range and reported 50% for sentences sharing no terms.
    percentage_similarity = 100 * cosine_sim

    return cosine_sim, percentage_similarity

def main():
    """Run the command-line workflow.

    Obtains the two sentences via ``get_sentences()``, echoes them, then
    prints the score and percentage returned by ``calculate_similarity()``.
    """
    first, second = get_sentences()
    print(f"Sentence 1: {first}")
    print(f"Sentence 2: {second}")

    result = calculate_similarity(first, second)
    score = result[0]
    percentage = result[1]
    print(f"Cosine Similarity Score: {score}")
    print(f"Similarity Percentage: {percentage}%")

# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()