
@jasonsnell
Last active February 8, 2025 22:02
Count Duplicates in List
import subprocess
import re
import sys
from Levenshtein import ratio
from collections import defaultdict, Counter

# Configuration: Adjust as needed
INCLUDE_PARENTHESES = False  # Set to True to include parentheticals, False to exclude them
OUTPUT_TO_CLIPBOARD = True   # Set to True to copy output to clipboard, False to print to console


def get_clipboard_data():
    """Retrieve clipboard content using pbpaste on macOS."""
    try:
        return subprocess.check_output("pbpaste", universal_newlines=True).strip()
    except subprocess.CalledProcessError as e:
        print("Error accessing clipboard:", e)
        return ""


def set_clipboard_data(data):
    """Set clipboard content using pbcopy on macOS."""
    try:
        process = subprocess.Popen("pbcopy", stdin=subprocess.PIPE)
        process.communicate(input=data.encode("utf-8"))
    except Exception as e:
        print("Error setting clipboard:", e)


def normalize_response(response):
    """Normalize response by removing articles, punctuation, extra spaces, and making it lowercase."""
    response = response.strip().lower()
    response = re.sub(r"[^a-z0-9\s]", "", response)  # Remove punctuation
    if response.startswith("the "):
        response = response[4:]  # Remove "the " if present
    response = re.sub(r"\s+", " ", response)  # Collapse multiple spaces
    return response


def group_similar_responses(responses, threshold=0.75):
    grouped_responses = defaultdict(list)
    # Iterate through all non-blank responses
    for response in responses:
        if not response.strip():  # Skip blank lines
            continue
        normalized_response = normalize_response(response)
        # Compare with existing groups
        for key in grouped_responses:
            if ratio(normalized_response, key) >= threshold:
                grouped_responses[key].append(response.strip())
                break
        else:
            # No similar group found, create a new group
            grouped_responses[normalized_response].append(response.strip())
    # Format grouped_responses with unique, case-insensitive coalesced responses
    formatted_groups = {}
    for key, items in grouped_responses.items():
        # Count frequency of each variant
        count_variants = Counter(map(str.lower, items))
        most_common = count_variants.most_common(1)[0][0]  # Most frequent item
        most_common_title = next(item for item in items if item.lower() == most_common).title()
        # List other unique variants, excluding the most common one
        other_variants = sorted(
            {item.title() for item in items if item.lower() != most_common}
        )
        if INCLUDE_PARENTHESES and other_variants:
            formatted_groups[f"{most_common_title} ({', '.join(other_variants)})"] = len(items)
        else:
            formatted_groups[most_common_title] = len(items)
    return formatted_groups


def process_responses(input_data):
    """Process and format responses."""
    responses = input_data.splitlines()
    result = group_similar_responses(responses, threshold=0.75)
    # Prepare output
    output = "\n".join(f"{key}\t{count}" for key, count in sorted(result.items(), key=lambda x: x[1], reverse=True))
    # Output to clipboard or console
    if OUTPUT_TO_CLIPBOARD:
        set_clipboard_data(output)
        print("Output copied to clipboard.")
    else:
        print(output)


# Main function to handle input and output options
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Accept input directly as an argument
        input_data = " ".join(sys.argv[1:])
    else:
        # Use clipboard as input source
        input_data = get_clipboard_data()
        if not input_data:
            print("No clipboard data found!")
            sys.exit(1)
    process_responses(input_data)
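A quick way to see what the grouping produces, bypassing the clipboard helpers, is to call group_similar_responses directly on a small in-memory list. This is a sketch with invented sample responses, not part of the gist:

# Illustrative sketch: sample data invented for this example
sample_responses = [
    "Picard",
    "picard",
    "Pickard",   # near-miss spelling, similar enough to group at the 0.75 threshold
    "Janeway",
    "janeway",
    "",          # blank lines are skipped
]
counts = group_similar_responses(sample_responses, threshold=0.75)
for name, count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}\t{count}")
# Prints (tab-separated):
# Picard    3
# Janeway   2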
@leoncowle

Very cool, Jason. I learned some things!

FYI, a nice trick to avoid needing the found_group variable and the if statement is to use 'else' at the end of the for loop:

# Compare with existing groups
for key in grouped_responses:
    if ratio(normalized_response, key) >= threshold:
        grouped_responses[key].append(response.strip())
        break
else:
    # No similar group was found, create a new group
    grouped_responses[normalized_response].append(response.strip())
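For comparison, the flag-based version the else clause avoids would look roughly like this (a reconstruction for illustration, not code from an earlier revision of the gist):

# Flag-based equivalent of the for/else above (illustrative reconstruction)
found_group = False
for key in grouped_responses:
    if ratio(normalized_response, key) >= threshold:
        grouped_responses[key].append(response.strip())
        found_group = True
        break
if not found_group:
    # No similar group was found, create a new group
    grouped_responses[normalized_response].append(response.strip())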

@jasonsnell

I assume ChatGPT wrote that part ;-)

@foresmac

foresmac commented Jan 16, 2025

Lines 31–32 could be replaced with:
response = response.removeprefix("the ")

I also thought you could get rid of the map call to lower when making the counter, since you lowercase all the strings when you normalize the data. But I see now that the normalized form is only used to group items as a dict key; the appended responses keep their original case, so the lowercasing is still needed there.
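To illustrate with a made-up group (not real data from the script): the dict key is the normalized form, but the stored responses keep whatever casing they arrived with, so the lowercase map is what merges case variants before most_common picks a winner:

# Hypothetical group for illustration only
grouped_responses = {"beatles": ["Beatles", "beatles", "The Beatles"]}
Counter(grouped_responses["beatles"]).most_common(1)
# -> [('Beatles', 1)]  each casing counted separately
Counter(map(str.lower, grouped_responses["beatles"])).most_common(1)
# -> [('beatles', 2)]  case variants merged before picking the most common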
