@thistleknot
Created December 24, 2023 00:22
tfidf summarizer
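A small TF-IDF extractive summarizer: it scores each word of every sentence with TF-IDF, keeps the top n% of words at several quantile thresholds, and rebuilds progressively shorter versions of each sentence in their original word order. It also flags sentences whose token count exceeds the mean by more than one standard deviation.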
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
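# Requires textblob, scikit-learn, and numpy. TextBlob may also need its NLTK
# corpora downloaded first (python -m textblob.download_corpora).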
original_text = [
    "Don't incur technical debt, fully define what is proposed.",
    "Prefer O'Reilly style writing using examples of time-tested failproof boilerplate solutions with docstring comments.",
    "Assume user's expertise: Masters in Data Science, Classical Philosophy, and proficiency in AI, Python, SQL.",
    "Always deliver production ready code.",
    "Focus on one problem/solution mapping at a time.",
    "Walk through the current process including up to any identified bugs.",
    "Validate contingent plans by making explicit your strategic approach.",
    "Identify potential code issues, including bugs, inefficiencies, and/or poor programming logic.",
    "Walk through any proposed process.",
    "Define success criteria for validation: inputs, expected outputs, and data processing methods.",
    "Provide explicit instructions and define any function definitions you are depending on.",
    "Compare expected vs. actual outcomes to understand errors.",
    "Evaluate, provide feedback for incorporating, ensure the ask is met, and identify omissions.",
    "Request results for review to understand ongoing issues and explore pivots and possible enhancements.",
    "Incorporate contextual nuance.",
    "Systematically walk through relevant sections of code to ensure coherence and correctness.",
    "Prioritize logic, optimize code, and favor efficient practices like list comprehensions.",
    "Consider approaches from the perspective of scientific, popular opinion and common consensus.",
    "Collect your thoughts & take a deep breath.",
    "Identify relevant important topics for the task, maintaining a moving commentary across collected topics each response.",
    "Restate and rephrase the objective. If needed, request additional input to respond properly.",
    "Understand what processes are parallel vs sequential (iterative).",
    "Develop a robust plan, outline processes with dependency graphs, and document functions, parameters, and outputs.",
    "Unit test functions/classes using data analysis feature. Use generic examples.",
    "Implement structural debugging with informative output.",
    "Begin with the deepest nested code, considering object creation and function calls.",
    " then proceed to write the next layer up (class or set of functions).",
    " When processing each function, think step by step.",
    " repeat until all areas are covered.",
    "Compare pre/post changes: Analyze the impact of modifications.",
    "Identify redundancies, such as similar functions/objects/code, etc.",
    " For each similar grouping",
    " Identify what is universal among the sections (high level topic, forms baseline).",
    " Identify differentiating characteristics (additives over baseline).",
    " Synthesize a revised function/object/code containing the best elements from each.",
    " Rewrite the class or functions one at a time.",
]
def calculate_tfidf_scores(original_text):
    """
    Calculate the TF-IDF scores for words in the sentences.

    :param original_text: a list of strings, where each string is a sentence
    :return: a tuple (tfidf_matrix, feature_names), where tfidf_matrix has shape
        (N, M) for N sentences and M unique words in the corpus, and
        feature_names lists the word corresponding to each of the M columns
    """
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(original_text)
    return tfidf_matrix, tfidf_vectorizer.get_feature_names_out()
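# Illustrative check (a toy example, safe to delete): scikit-learn assigns
# TF-IDF columns by sorted vocabulary, not by word position in a sentence,
# which is why the feature names returned above are needed to map each word
# back to its score.
_demo_vectorizer = TfidfVectorizer()
_demo_vectorizer.fit(["beta alpha", "alpha gamma"])
assert list(_demo_vectorizer.get_feature_names_out()) == ["alpha", "beta", "gamma"]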
def filter_top_percent_words(tfidf_matrix, feature_names, sentence_index, sentence, quantiles):
    """
    Filter the words in a sentence based on top n% quantile thresholds of TF-IDF scores.

    :param tfidf_matrix: a matrix containing the TF-IDF scores of words in sentences
    :param feature_names: the vocabulary words corresponding to the matrix columns
    :param sentence_index: the row of tfidf_matrix corresponding to this sentence
    :param sentence: the sentence represented as a TextBlob object
    :param quantiles: list of quantile threshold values
    :return: a list of lists of words, where each inner list contains the top n%
        of the sentence's words ranked by their TF-IDF scores
    """
    # Map each vocabulary word to its TF-IDF score for this sentence; the matrix
    # columns follow the sorted vocabulary, not word order within the sentence.
    tfidf_scores = tfidf_matrix[sentence_index].toarray().flatten()
    score_by_word = dict(zip(feature_names, tfidf_scores))
    # Look scores up per word (the vectorizer lowercases; words it dropped score 0).
    word_tfidf_tuples = [(word, score_by_word.get(word.lower(), 0.0)) for word in sentence.words]
    word_tfidf_tuples.sort(key=lambda x: x[1], reverse=True)  # Sort by TF-IDF score, descending
    total_words = len(word_tfidf_tuples)
    selected_words = []
    for q in quantiles:
        num_words_to_select = int(total_words * q)
        words = [word for word, tfidf_score in word_tfidf_tuples[:num_words_to_select]]
        selected_words.append(words)
    return selected_words
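# Illustrative example (safe to delete): with 8 ranked words, quantiles 0.25,
# 0.5 and 0.75 keep the top 2, 4 and 6 words respectively, since each cut is
# int(total_words * q) applied to the descending-score ordering.
assert [int(8 * q) for q in (0.25, 0.5, 0.75)] == [2, 4, 6]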
# Tokenize the sentences and calculate the number of tokens in each sentence
token_counts = []
for sentence in original_text:
    blob = TextBlob(sentence)
    tokens = blob.words
    token_counts.append(len(tokens))
# Calculate the mean and standard deviation of tokens per sentence
mean_tokens_per_sentence = np.mean(token_counts)
std_deviation_tokens_per_sentence = np.std(token_counts)
# Calculate the threshold for identifying lines with more tokens than 1 standard deviation above the mean
threshold = mean_tokens_per_sentence + std_deviation_tokens_per_sentence
# Identify lines with more tokens than the threshold
lines_above_threshold = [sentence for sentence, token_count in zip(original_text, token_counts) if token_count > threshold]
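# Worked example (illustrative): for token counts [4, 6, 11] the mean is 7.0 and
# np.std (population standard deviation) is about 2.94, so the threshold is
# about 9.94 and only the 11-token sentence would be flagged.
assert round(float(np.std([4, 6, 11])), 2) == 2.94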
quantiles = [0.25, 0.5, 0.66, 0.75, 0.86, 0.91]
# Calculate the TF-IDF scores for words in the sentences
tfidf_matrix, feature_names = calculate_tfidf_scores(original_text)
# Initialize a list to store the sum of selected words for each sentence
sum_selected_words = [[] for _ in range(len(original_text))]
# Tokenize the sentences
sentences = [TextBlob(sentence) for sentence in original_text]
# Filter the words in each sentence based on top n% quantile threshold and store the ordered lists of words
for i, sentence in enumerate(sentences):
    sum_selected_words[i] = filter_top_percent_words(tfidf_matrix, feature_names, i, sentence, quantiles)
def reconstruct_sentences(sentences, sum_selected_words, quantiles):
    """
    Reconstruct sentences while maintaining the original word order for each quantile.

    :param sentences: the sentences represented as a list of TextBlob objects
    :param sum_selected_words: the list of lists of top n% words based on their TF-IDF scores for each quantile
    :param quantiles: list of quantile threshold values
    :return: a list of lists of reconstructed sentences for each quantile
    """
    num_quantiles = len(quantiles)
    reconstructed_sentences_quantiles = [[] for _ in range(num_quantiles)]
    for i, sentence in enumerate(sentences):
        for j, q in enumerate(quantiles):
            selected_words = sum_selected_words[i][j]
            selected_word_set = set(selected_words)
            # Keep the sentence's original word order; only membership is filtered.
            reconstructed_sentence = " ".join(
                [word for word in sentence.words if word in selected_word_set]
            )
            reconstructed_sentences_quantiles[j].append(reconstructed_sentence)
    return reconstructed_sentences_quantiles
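# Illustrative example (safe to delete): reconstruction preserves the original
# word order no matter how the selected words were ranked.
_words = ["keep", "the", "original", "order"]
_kept = {"order", "keep"}
assert " ".join(w for w in _words if w in _kept) == "keep order"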
# Reconstruct sentences for each quantile
reconstructed_sentences_quantiles = reconstruct_sentences(sentences, sum_selected_words, quantiles)
# Print the reconstructed sentences for each quantile
for q, reconstructed_sentences in zip(quantiles, reconstructed_sentences_quantiles):
    print(f"{q * 100:.0f}%-th Quantile:")
    for i, rs in enumerate(reconstructed_sentences):
        print(f"Sentence {i + 1}: {rs}")
    print()
# Print the tokenized sentences
for s in sentences:
    print(s)
# Print the results
print("Mean Tokens Per Sentence:", mean_tokens_per_sentence)
print("Standard Deviation of Tokens Per Sentence:", std_deviation_tokens_per_sentence)
print("threshold:",threshold)
# Print the lines that are over 1 standard deviation above the mean
print("Lines with more tokens than 1 standard deviation above the mean:")
for line in lines_above_threshold:
    print(line)