Last active
February 8, 2025 22:02
-
-
Save jasonsnell/7df8d908b6c466e916c3ccce7a9e736d to your computer and use it in GitHub Desktop.
Count Duplicates in List
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import re | |
import sys | |
from Levenshtein import ratio | |
from collections import defaultdict, Counter | |
# Configuration: adjust as needed.
# When True, each output label also lists the group's other variants in
# parentheses; when False, only the most frequent variant is shown.
INCLUDE_PARENTHESES = False
# When True, the result is copied to the clipboard; when False, it is printed
# to the console.
OUTPUT_TO_CLIPBOARD = True
def get_clipboard_data():
    """Return the current clipboard text, read via ``pbpaste`` on macOS.

    Returns:
        The clipboard contents with leading/trailing whitespace stripped, or
        an empty string if ``pbpaste`` cannot be started or exits with an
        error (the error is printed to stdout).
    """
    try:
        return subprocess.check_output(["pbpaste"], universal_newlines=True).strip()
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ``pbpaste`` (e.g. when
        # run on a non-macOS system), which would otherwise escape as an
        # unhandled traceback instead of the empty-string fallback.
        print("Error accessing clipboard:", e)
        return ""
def set_clipboard_data(data):
    """Copy *data* to the clipboard via ``pbcopy`` on macOS.

    This is best-effort: any failure is printed to stdout and otherwise
    ignored, so the function never raises.

    Args:
        data: Text to place on the clipboard; it is sent to ``pbcopy``
            encoded as UTF-8.
    """
    try:
        # The payload is encoded before the child is launched, so an encoding
        # failure no longer leaves a dangling ``pbcopy`` process. ``run`` waits
        # for the child, and check=True turns a non-zero exit status into an
        # exception so a failed copy is reported rather than passing silently.
        subprocess.run(["pbcopy"], input=data.encode("utf-8"), check=True)
    except Exception as e:
        print("Error setting clipboard:", e)
def normalize_response(response):
    """Return a canonical form of *response* used as a grouping key.

    The text is lowercased, stripped of punctuation (including underscores),
    has runs of whitespace collapsed to single spaces, and loses a leading
    "the ". Letters and digits from any script are kept, so accented or
    non-Latin answers are not mangled.

    Args:
        response: Raw response text.

    Returns:
        The normalized string; may be empty if nothing but punctuation and
        whitespace was supplied.
    """
    response = response.strip().lower()
    # \w is Unicode-aware for str patterns, so this keeps letters such as
    # "é"; the underscore is part of \w and is therefore removed explicitly.
    response = re.sub(r"[^\w\s]|_", "", response)
    # Collapse and trim whitespace *before* looking for the article, so that
    # removed leading punctuation or doubled spaces cannot hide it or leave a
    # stray leading space behind.
    response = re.sub(r"\s+", " ", response).strip()
    if response.startswith("the "):
        response = response[4:]  # Drop the leading article "the "
    return response
def group_similar_responses(responses, threshold=0.75):
    """Group near-duplicate responses and count the members of each group.

    Every non-blank response is normalized with ``normalize_response`` and
    added to the first existing group whose key has a Levenshtein ``ratio``
    of at least *threshold* with it; if none qualifies, the response starts a
    new group keyed by its own normalized form.

    Args:
        responses: Iterable of raw response strings; blank entries are
            skipped.
        threshold: Minimum similarity ratio required to join an existing
            group.

    Returns:
        A dict mapping a display label to the number of responses in that
        group. The label is the title-cased most frequent variant, followed
        by the other title-cased variants in parentheses when
        ``INCLUDE_PARENTHESES`` is true and such variants exist.
    """
    grouped_responses = defaultdict(list)

    for response in responses:
        stripped = response.strip()
        if not stripped:  # Skip blank lines
            continue

        normalized = normalize_response(response)
        if normalized in grouped_responses:
            # Exact repeat of an existing key: the fuzzy scan would land on
            # this same key (every earlier key was already rejected when the
            # group was created), so skip the per-group ratio() calls.
            group_key = normalized
        else:
            # First sufficiently similar group in insertion order, or a new
            # group keyed by this response's normalized form.
            group_key = next(
                (
                    key
                    for key in grouped_responses
                    if ratio(normalized, key) >= threshold
                ),
                normalized,
            )
        grouped_responses[group_key].append(stripped)

    # Build the display labels, coalescing variants case-insensitively.
    formatted_groups = {}
    for items in grouped_responses.values():
        # Count frequency of each variant, ignoring case
        variant_counts = Counter(item.lower() for item in items)
        most_common = variant_counts.most_common(1)[0][0]
        most_common_title = next(
            item for item in items if item.lower() == most_common
        ).title()

        # Other unique variants, excluding the most common one
        other_variants = sorted(
            {item.title() for item in items if item.lower() != most_common}
        )

        if INCLUDE_PARENTHESES and other_variants:
            label = f"{most_common_title} ({', '.join(other_variants)})"
        else:
            label = most_common_title
        formatted_groups[label] = len(items)

    return formatted_groups
def process_responses(input_data):
    """Group the responses in *input_data* and emit a tab-separated tally.

    *input_data* is split into lines, one response per line. The output has
    one ``label<TAB>count`` line per group, highest count first. It is copied
    to the clipboard when ``OUTPUT_TO_CLIPBOARD`` is true and printed to the
    console otherwise.
    """
    tallies = group_similar_responses(input_data.splitlines(), threshold=0.75)

    # Highest counts first; groups with equal counts keep their original order.
    ranked = sorted(tallies.items(), key=lambda entry: entry[1], reverse=True)
    rows = [f"{label}\t{total}" for label, total in ranked]
    output = "\n".join(rows)

    if not OUTPUT_TO_CLIPBOARD:
        print(output)
        return

    set_clipboard_data(output)
    print("Output copied to clipboard.")
# Script entry point: take the responses from the command line when arguments
# are given, otherwise from the clipboard.
if __name__ == "__main__":
    if sys.argv[1:]:
        # All arguments are joined with single spaces into one input string.
        input_data = " ".join(sys.argv[1:])
    else:
        input_data = get_clipboard_data()
        if not input_data:
            # Nothing to process: report it and exit with a failure status.
            print("No clipboard data found!")
            sys.exit(1)

    process_responses(input_data)
I assume ChatGPT wrote that part ;-)
Line 31–32 could be replaced with:
response = response.removeprefix("the ")
I also think you could get rid of the map call to lower when making the counter, since you lowercase all the strings when you normalize the data.
I see; the normalized form is only used to group items as a dict key.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Very cool, Jason. I learned some things!
FYI, a nice trick to avoid needing the found_group variable and the if statement is to use 'else' at the end of the for loop: