Last active
June 18, 2024 05:38
-
-
Save mike-seger/78f625ea843b57213e049c9f3e417e86 to your computer and use it in GitHub Desktop.
Duplicate file report
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Duplicate file report.
#
# Usage: ./duplicate_file_report.sh /path/to/root_directory
#
# Hashes every regular file below the given root with md5sum and writes two
# tab-separated reports into the current working directory:
#
#   report1.tsv  <copies> <md5> <matching lines>
#                One row per file, sorted numerically descending. <copies> is
#                how many files share the MD5 sum; <matching lines> is the
#                number of lines in the file containing at least two
#                consecutive alphanumeric characters.
#   report2.tsv  <md5> <path>
#                One row per file, sorted by path.
#
# Limitation: the reports are line- and tab-oriented, so paths containing a
# tab, a newline or a backslash cannot be represented faithfully (md5sum
# escapes the latter two in its output).

# Check if root directory is provided
if [ -z "$1" ]; then
  echo "Usage: $0 /path/to/root_directory" >&2
  exit 1
fi

root_dir=$1
output_file1="report1.tsv"
output_file2="report2.tsv"

# Fail early instead of producing two empty reports for a mistyped path.
if [ ! -e "$root_dir" ]; then
  echo "Error: '$root_dir' does not exist" >&2
  exit 1
fi

# Keep all scratch files in one private directory and remove it on every
# exit path (success, error, or interrupt).
tmp_dir=$(mktemp -d) || {
  echo "Error: could not create temporary directory" >&2
  exit 1
}
trap 'rm -rf -- "$tmp_dir"' EXIT

temp_md5_file="$tmp_dir/md5"
temp_count_file="$tmp_dir/count"
temp_matching_lines="$tmp_dir/matching"

# Traverse the directory, compute MD5 sums, and store "<md5>\t<path>" lines.
# md5sum prints a 32-character digest, a space, a mode character (' ' or '*')
# and then the file name, so the name always starts at column 35. Slicing by
# column (instead of by whitespace field) keeps paths with spaces intact.
# A leading backslash marks an escaped file name and is dropped first.
find "$root_dir" -type f -exec md5sum {} + \
  | awk '{ sub(/^\\/, ""); print substr($0, 1, 32) "\t" substr($0, 35) }' \
  > "$temp_md5_file"

# Count the number of repetitions of each MD5 sum -> "<copies>\t<md5>".
# The C locale makes the ordering byte-wise, which is what join expects below;
# uniq -c preserves that ordering, so no further sort is needed.
cut -f1 "$temp_md5_file" \
  | LC_ALL=C sort \
  | uniq -c \
  | awk '{print $1 "\t" $2}' \
  > "$temp_count_file"

# Count the number of lines matching [A-Za-z0-9]{2,} for each file
# -> "<md5>\t<matching lines>".
while IFS=$'\t' read -r md5 filepath; do
  # grep -c prints the count itself; it exits non-zero when the count is 0,
  # which is not an error here. If grep cannot read the file it prints
  # nothing, in which case the count falls back to 0.
  match_count=$(grep -c -E '[A-Za-z0-9]{2,}' -- "$filepath" 2>/dev/null)
  printf '%s\t%s\n' "$md5" "${match_count:-0}"
done < "$temp_md5_file" > "$temp_matching_lines"

# Generate output file 1: join both tables on the MD5 sum, then reorder the
# columns to "<copies>\t<md5>\t<matching lines>".
LC_ALL=C join -t $'\t' -1 2 -2 1 \
  "$temp_count_file" <(LC_ALL=C sort "$temp_matching_lines") \
  | awk -F '\t' -v OFS='\t' '{print $2, $1, $3}' \
  | sort -nr > "$output_file1"

# Generate output file 2: the hash list ordered by path. The explicit tab
# separator keeps the sort key correct for paths that contain spaces.
sort -t $'\t' -k2 "$temp_md5_file" > "$output_file2"

# Temporary files are removed by the EXIT trap.
echo "Report generated: $output_file1 and $output_file2"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment