
@jasonsnell
Last active February 8, 2025 22:02
Count Duplicates in List
import subprocess
import re
import sys
from Levenshtein import ratio
from collections import defaultdict, Counter

# Configuration: Adjust as needed
INCLUDE_PARENTHESES = False  # Set to True to include parentheticals, False to exclude them
OUTPUT_TO_CLIPBOARD = True   # Set to True to copy output to clipboard, False to print to console


def get_clipboard_data():
    """Retrieve clipboard content using pbpaste on macOS."""
    try:
        return subprocess.check_output("pbpaste", universal_newlines=True).strip()
    except subprocess.CalledProcessError as e:
        print("Error accessing clipboard:", e)
        return ""


def set_clipboard_data(data):
    """Set clipboard content using pbcopy on macOS."""
    try:
        process = subprocess.Popen("pbcopy", stdin=subprocess.PIPE)
        process.communicate(input=data.encode("utf-8"))
    except Exception as e:
        print("Error setting clipboard:", e)


def normalize_response(response):
    """Normalize response by removing articles, punctuation, extra spaces, and making it lowercase."""
    response = response.strip().lower()
    response = re.sub(r"[^a-z0-9\s]", "", response)  # Remove punctuation
    if response.startswith("the "):
        response = response[4:]  # Remove "the " if present
    response = re.sub(r"\s+", " ", response)  # Collapse multiple spaces
    return response


def group_similar_responses(responses, threshold=0.75):
    grouped_responses = defaultdict(list)
    # Iterate through all non-blank responses
    for response in responses:
        if not response.strip():  # Skip blank lines
            continue
        normalized_response = normalize_response(response)
        # Compare with existing groups
        for key in grouped_responses:
            if ratio(normalized_response, key) >= threshold:
                grouped_responses[key].append(response.strip())
                break
        else:
            # No similar group found, create a new group
            grouped_responses[normalized_response].append(response.strip())
    # Format grouped_responses with unique, case-insensitive coalesced responses
    formatted_groups = {}
    for key, items in grouped_responses.items():
        # Count frequency of each variant
        count_variants = Counter(map(str.lower, items))
        most_common = count_variants.most_common(1)[0][0]  # Most frequent item
        most_common_title = next(item for item in items if item.lower() == most_common).title()
        # List other unique variants, excluding the most common one
        other_variants = sorted(
            {item.title() for item in items if item.lower() != most_common}
        )
        if INCLUDE_PARENTHESES and other_variants:
            formatted_groups[f"{most_common_title} ({', '.join(other_variants)})"] = len(items)
        else:
            formatted_groups[most_common_title] = len(items)
    return formatted_groups


def process_responses(input_data):
    """Process and format responses."""
    responses = input_data.splitlines()
    result = group_similar_responses(responses, threshold=0.75)
    # Prepare output
    output = "\n".join(f"{key}\t{count}" for key, count in sorted(result.items(), key=lambda x: x[1], reverse=True))
    # Output to clipboard or console
    if OUTPUT_TO_CLIPBOARD:
        set_clipboard_data(output)
        print("Output copied to clipboard.")
    else:
        print(output)


# Main function to handle input and output options
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Accept input directly as an argument
        input_data = " ".join(sys.argv[1:])
    else:
        # Use clipboard as input source
        input_data = get_clipboard_data()
        if not input_data:
            print("No clipboard data found!")
            sys.exit(1)
    process_responses(input_data)
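A quick way to see what the grouping produces, bypassing the clipboard helpers, is to call group_similar_responses directly on a small in-memory list. This is a sketch with invented sample responses, not part of the gist:

# Illustrative sketch: sample data invented for this example
sample_responses = [
    "Picard",
    "picard",
    "Pickard",   # near-miss spelling, similar enough to group at the 0.75 threshold
    "Janeway",
    "janeway",
    "",          # blank lines are skipped
]
counts = group_similar_responses(sample_responses, threshold=0.75)
for name, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}\t{count}")
# Prints (tab-separated):
# Picard    3
# Janeway   2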
@leoncowle

Very cool, Jason. I learned some things!

FYI, a nice trick to avoid needing the found_group variable and the if statement is to use 'else' at the end of the for loop:

# Compare with existing groups
for key in grouped_responses:
    if ratio(normalized_response, key) >= threshold:
        grouped_responses[key].append(response.strip())
        break
else:
    # No similar group was found, create a new group
    grouped_responses[normalized_response].append(response.strip())
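For comparison, the flag-based version the else clause avoids would look roughly like this (a reconstruction for illustration, not code from an earlier revision of the gist):

# Flag-based equivalent of the for/else above (illustrative reconstruction)
found_group = False
for key in grouped_responses:
    if ratio(normalized_response, key) >= threshold:
        grouped_responses[key].append(response.strip())
        found_group = True
        break
if not found_group:
    # No similar group was found, create a new group
    grouped_responses[normalized_response].append(response.strip())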

@jasonsnell

I assume ChatGPT wrote that part ;-)

@foresmac

foresmac commented Jan 16, 2025

Lines 31–32 could be replaced with:
response = response.removeprefix("the ")

I also thought you could get rid of the map call to lower when making the counter, since you lowercase all the strings when you normalize the data. But I see now that the normalized form is only used to group items as a dict key; the appended responses keep their original case, so the lowercasing is still needed there.
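To illustrate with a made-up group (not real data from the script): the dict key is the normalized form, but the stored responses keep whatever casing they arrived with, so the lowercase map is what merges case variants before most_common picks a winner:

# Hypothetical group for illustration only
grouped_responses = {"beatles": ["Beatles", "beatles", "The Beatles"]}
Counter(grouped_responses["beatles"]).most_common(1)
# -> [('Beatles', 1)]  each casing counted separately
Counter(map(str.lower, grouped_responses["beatles"])).most_common(1)
# -> [('beatles', 2)]  case variants merged before picking the most common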
