mike-seger/dupfile-report.sh

## dupfile-report.sh
#!/bin/bash
#
# dupfile-report.sh - report duplicate files below a directory by MD5 checksum.
#
# Usage: ./dupfile-report.sh /path/to/root_directory
#
# Writes two tab-separated reports into the current working directory:
#   report1.tsv  <copies> <md5> <matching_lines>; one row per file, sorted by
#                the number of copies (descending). <matching_lines> is the
#                number of lines in the file matching [A-Za-z0-9]{2,}.
#   report2.tsv  <md5> <path>; one row per file, sorted by path.
#
# Paths containing a tab or a newline cannot be stored in a TSV row and are
# skipped with a warning on stderr; unreadable files are skipped as well.

# Check if root directory is provided
if [[ -z "$1" ]]; then
  echo "Usage: $0 /path/to/root_directory"
  exit 1
fi

root_dir=$1
output_file1="report1.tsv"
output_file2="report2.tsv"

# Fail early instead of producing empty reports for a mistyped path.
if [[ ! -e "$root_dir" ]]; then
  echo "Error: '$root_dir' does not exist" >&2
  exit 1
fi

# Keep all scratch files in one private directory and remove it on every exit
# path (normal end, error, or interruption).
temp_dir=$(mktemp -d) || {
  echo "Error: cannot create temporary directory" >&2
  exit 1
}
trap 'rm -rf -- "$temp_dir"' EXIT

temp_md5_file="$temp_dir/md5"                 # <md5> TAB <path>
temp_count_file="$temp_dir/count"             # <md5> TAB <copies>
temp_matching_lines="$temp_dir/matching"      # <md5> TAB <matching_lines>

# Traverse the directory once. File names are read NUL-delimited so that
# spaces and backslashes in paths survive. The checksum is computed from stdin
# because md5sum escapes unusual file names when it prints them itself.
while IFS= read -r -d '' filepath; do
  case $filepath in
    *$'\t'* | *$'\n'*)
      printf 'Skipping path containing a tab or newline: %q\n' "$filepath" >&2
      continue
      ;;
  esac

  if ! checksum=$(md5sum < "$filepath"); then
    printf 'Skipping unreadable file: %s\n' "$filepath" >&2
    continue
  fi
  checksum=${checksum%% *}   # "<md5>  -" -> "<md5>"

  # grep -c prints the count even when it is 0 (and then exits non-zero).
  match_count=$(grep -c -E -- '[A-Za-z0-9]{2,}' "$filepath")

  printf '%s\t%s\n' "$checksum" "$filepath" >&3
  printf '%s\t%s\n' "$checksum" "${match_count:-0}" >&4
done < <(find "$root_dir" -type f -print0) \
  3> "$temp_md5_file" 4> "$temp_matching_lines"

# Count the number of repetitions of each MD5 sum. The result stays sorted by
# checksum in the C locale, which is the order join expects below.
cut -f1 "$temp_md5_file" \
  | LC_ALL=C sort \
  | uniq -c \
  | awk '{print $2 "\t" $1}' > "$temp_count_file"

# Generate output file 1: join both tables on the checksum, reorder the
# columns to <copies> <md5> <matching_lines> and put the most duplicated first.
LC_ALL=C join -t $'\t' \
  "$temp_count_file" \
  <(LC_ALL=C sort -t $'\t' -k1,1 "$temp_matching_lines") \
  | awk -F '\t' '{print $2 "\t" $1 "\t" $3}' \
  | sort -nr > "$output_file1"

# Generate output file 2: every file with its checksum, ordered by path.
sort -t $'\t' -k2 "$temp_md5_file" > "$output_file2"

echo "Report generated: $output_file1 and $output_file2"
	#!/bin/bash
	#
	# dupfile-report.sh - report duplicate files below a directory by MD5 checksum.
	#
	# Usage: ./dupfile-report.sh /path/to/root_directory
	#
	# Writes two tab-separated reports into the current working directory:
	#   report1.tsv  <copies> <md5> <matching_lines>; one row per file, sorted by
	#                the number of copies (descending). <matching_lines> is the
	#                number of lines in the file matching [A-Za-z0-9]{2,}.
	#   report2.tsv  <md5> <path>; one row per file, sorted by path.
	#
	# Paths containing a tab or a newline cannot be stored in a TSV row and are
	# skipped with a warning on stderr; unreadable files are skipped as well.

	# Check if root directory is provided
	if [[ -z "$1" ]]; then
	  echo "Usage: $0 /path/to/root_directory"
	  exit 1
	fi

	root_dir=$1
	output_file1="report1.tsv"
	output_file2="report2.tsv"

	# Fail early instead of producing empty reports for a mistyped path.
	if [[ ! -e "$root_dir" ]]; then
	  echo "Error: '$root_dir' does not exist" >&2
	  exit 1
	fi

	# Keep all scratch files in one private directory and remove it on every exit
	# path (normal end, error, or interruption).
	temp_dir=$(mktemp -d) || {
	  echo "Error: cannot create temporary directory" >&2
	  exit 1
	}
	trap 'rm -rf -- "$temp_dir"' EXIT

	temp_md5_file="$temp_dir/md5"                 # <md5> TAB <path>
	temp_count_file="$temp_dir/count"             # <md5> TAB <copies>
	temp_matching_lines="$temp_dir/matching"      # <md5> TAB <matching_lines>

	# Traverse the directory once. File names are read NUL-delimited so that
	# spaces and backslashes in paths survive. The checksum is computed from stdin
	# because md5sum escapes unusual file names when it prints them itself.
	while IFS= read -r -d '' filepath; do
	  case $filepath in
	    *$'\t'* | *$'\n'*)
	      printf 'Skipping path containing a tab or newline: %q\n' "$filepath" >&2
	      continue
	      ;;
	  esac

	  if ! checksum=$(md5sum < "$filepath"); then
	    printf 'Skipping unreadable file: %s\n' "$filepath" >&2
	    continue
	  fi
	  checksum=${checksum%% *}   # "<md5>  -" -> "<md5>"

	  # grep -c prints the count even when it is 0 (and then exits non-zero).
	  match_count=$(grep -c -E -- '[A-Za-z0-9]{2,}' "$filepath")

	  printf '%s\t%s\n' "$checksum" "$filepath" >&3
	  printf '%s\t%s\n' "$checksum" "${match_count:-0}" >&4
	done < <(find "$root_dir" -type f -print0) \
	  3> "$temp_md5_file" 4> "$temp_matching_lines"

	# Count the number of repetitions of each MD5 sum. The result stays sorted by
	# checksum in the C locale, which is the order join expects below.
	cut -f1 "$temp_md5_file" \
	  | LC_ALL=C sort \
	  | uniq -c \
	  | awk '{print $2 "\t" $1}' > "$temp_count_file"

	# Generate output file 1: join both tables on the checksum, reorder the
	# columns to <copies> <md5> <matching_lines> and put the most duplicated first.
	LC_ALL=C join -t $'\t' \
	  "$temp_count_file" \
	  <(LC_ALL=C sort -t $'\t' -k1,1 "$temp_matching_lines") \
	  | awk -F '\t' '{print $2 "\t" $1 "\t" $3}' \
	  | sort -nr > "$output_file1"

	# Generate output file 2: every file with its checksum, ordered by path.
	sort -t $'\t' -k2 "$temp_md5_file" > "$output_file2"

	echo "Report generated: $output_file1 and $output_file2"