Created
April 4, 2016 15:17
-
-
Save printminion/abbac1dcd7b123d67b510b5272220c5b to your computer and use it in GitHub Desktop.
Check whether files contain duplicated lines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# @desc Check whether files contain duplicated lines
# @author Misha M.-Kupriyanov https://google.com/+MishaMKupriyanov
# @link https://gist.github.com/printminion/abbac1dcd7b123d67b510b5272220c5b
#
# Usage:
#   ./test_duplicates.sh *.csv          check every file the shell expands to
#   ./test_duplicates.sh '*.csv'        same, but the pattern is expanded here
#   ./test_duplicates.sh somefile.csv   check a single file
#
# Relative selectors are resolved against the current directory; absolute
# paths are used as given.
#
# Exit status:
#   0  no duplicated lines found (also when only the usage text is printed)
#   1  at least one file contains duplicated lines
#   2  no duplicates, but a selector matched nothing or a file was unreadable

#######################################
# Print the usage text.
# Outputs: usage text on stdout
#######################################
usage() {
  echo "Usage:"
  echo " ./test_duplicates.sh *.csv - test all files in folder"
  echo " ./test_duplicates.sh somefile.csv - test one file in folder"
  echo " exits with 1 if any duplicates were found"
}

#######################################
# Expand one selector into the existing paths it denotes.
# An existing path is returned as-is (even if its name contains glob
# characters); anything else is treated as a glob pattern.
# Arguments: $1 - path or glob pattern
# Outputs:   one existing path per line on stdout (nothing if no match)
#######################################
expand_selector() {
  local pattern=$1
  local match
  # Empty IFS disables word splitting, so the unquoted expansion below only
  # performs pathname expansion and paths containing spaces stay intact.
  local IFS=''

  if [[ -e "$pattern" ]]; then
    printf '%s\n' "$pattern"
    return 0
  fi

  # Intentionally unquoted: this is where a quoted pattern such as '*.csv'
  # gets expanded. An unmatched pattern stays literal and fails the -e test.
  # shellcheck disable=SC2086
  for match in $pattern; do
    if [[ -e "$match" ]]; then
      printf '%s\n' "$match"
    fi
  done
  return 0
}

#######################################
# Count how many distinct lines occur more than once in a file.
# Arguments: $1 - path to a readable file
# Outputs:   the count, as a bare integer, on stdout
#######################################
count_duplicate_lines() {
  # LC_ALL=C makes sort/uniq compare raw bytes, so lines are only grouped
  # when they are byte-identical. awk does the counting itself so the result
  # carries no padding and file content is never passed through `echo -e`.
  LC_ALL=C sort -- "$1" \
    | LC_ALL=C uniq -c \
    | awk '$1 + 0 > 1 { groups++ } END { print groups + 0 }'
}

#######################################
# Entry point: check every selected file and print a per-file report.
# Arguments: one or more file paths or glob patterns
# Outputs:   report on stdout, diagnostics about missing files on stderr
# Returns:   0, 1 or 2 as described in the file header
#######################################
main() {
  if [[ -z "${1:-}" ]]; then
    usage
    return 0
  fi

  local -a files=()
  local selector path matched
  local problems=0
  local duplicates_count
  local duplicates_grouped_count=0
  local files_with_duplicates_count=0

  printf 'Validate files %s for line duplicates\n' "$*"

  for selector in "$@"; do
    # Only relative selectors are anchored at the current directory.
    [[ "$selector" == /* ]] || selector="$PWD/$selector"

    matched=0
    while IFS= read -r path; do
      files+=("$path")
      matched=1
    done < <(expand_selector "$selector")

    if (( matched == 0 )); then
      printf '[e]No files match %s\n' "$selector" >&2
      problems=$((problems + 1))
    fi
  done

  if (( ${#files[@]} == 0 )); then
    return 2
  fi

  printf '%s\n' "${files[*]}"
  printf '[i]proceed with %s files\n' "${#files[@]}"
  printf 'file\tduplicates\n'

  for path in "${files[@]}"; do
    if [[ ! -f "$path" || ! -r "$path" ]]; then
      printf '[e]Cannot read %s\n' "$path" >&2
      problems=$((problems + 1))
      continue
    fi

    duplicates_count=$(count_duplicate_lines "$path")
    printf '%s\t%s\n' "${path##*/}" "$duplicates_count"

    duplicates_grouped_count=$((duplicates_grouped_count + duplicates_count))
    if (( duplicates_count > 0 )); then
      files_with_duplicates_count=$((files_with_duplicates_count + 1))
    fi
  done

  if (( duplicates_grouped_count > 0 )); then
    echo "[e]Error $duplicates_grouped_count lines with duplicates in $files_with_duplicates_count files!"
    return 1
  fi

  if (( problems > 0 )); then
    return 2
  fi

  echo "[i]0 duplicates"
  return 0
}

# Must stay the last command: the script's exit status is main's status.
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment