Skip to content

Instantly share code, notes, and snippets.

@printminion
Created April 4, 2016 15:17
Show Gist options
  • Save printminion/abbac1dcd7b123d67b510b5272220c5b to your computer and use it in GitHub Desktop.
Save printminion/abbac1dcd7b123d67b510b5272220c5b to your computer and use it in GitHub Desktop.
Check whether files contain duplicated lines
#!/usr/bin/env bash
# @desc Check whether files contain duplicated lines
# @author Misha M.-Kupriyanov https://google.com/+MishaMKupriyanov
# @link https://gist.github.com/printminion/abbac1dcd7b123d67b510b5272220c5b
#
# Usage:
#   ./test_duplicates.sh *.csv         test every matching file
#   ./test_duplicates.sh somefile.csv  test a single file
#   ./test_duplicates.sh '*.csv'       a quoted pattern is expanded by the script
#
# Exit status:
#   0  no duplicated lines found (or the usage text was printed)
#   1  at least one file contains duplicated lines
#   2  no duplicates found, but at least one file could not be read

#######################################
# Print the usage text.
# Outputs: usage text on stdout
#######################################
usage() {
  echo "Usage:"
  echo " ./test_duplicates.sh *.csv - test all files in folder"
  echo " ./test_duplicates.sh somefile.csv - test one file in folder"
  echo " exits with 1 if any duplicates were found"
}

#######################################
# Count the distinct lines that occur more than once in a file.
# Arguments: $1 - path of the file to inspect
# Outputs:   the number of duplicated line groups on stdout
#######################################
count_duplicate_groups() {
  # LC_ALL=C forces byte-wise comparison, so two lines are only treated as
  # duplicates when they are byte-identical, independent of the user's locale.
  # awk does the counting itself so the result carries no padding (unlike
  # `wc -l` on some platforms) and file content is never re-interpreted.
  LC_ALL=C sort -- "$1" \
    | LC_ALL=C uniq -c \
    | awk '$1 + 0 > 1 { n++ } END { print n + 0 }'
}

#######################################
# Resolve one command-line selector into file paths.
# An existing path is taken literally; anything else is treated as a glob
# pattern. A selector matching nothing is kept as-is so the caller can report
# it as unreadable.
# Globals:   files (array owned by the caller; matching paths are appended)
# Arguments: $1 - file path or glob pattern
#######################################
expand_selector() {
  local selector=$1
  local candidate
  local matched=0
  # An empty IFS disables word splitting, so the unquoted expansion below only
  # performs pathname expansion and paths containing spaces stay intact.
  local IFS=

  if [[ -e "$selector" ]]; then
    files+=("$selector")
    return 0
  fi

  # shellcheck disable=SC2086 -- pathname expansion is intended here
  for candidate in $selector; do
    # Without nullglob an unmatched pattern stays literal; filter it out.
    if [[ -e "$candidate" ]]; then
      files+=("$candidate")
      matched=1
    fi
  done

  if (( ! matched )); then
    files+=("$selector")
  fi
}

#######################################
# Check every given file for duplicated lines and print a per-file report.
# Arguments: file paths and/or glob patterns
# Outputs:   report on stdout, unreadable-file errors on stderr
# Returns:   0 no duplicates, 1 duplicates found, 2 unreadable file(s) only
#######################################
main() {
  if [[ -z "${1:-}" ]]; then
    usage
    return 0
  fi

  local -a files=()
  local selector
  for selector in "$@"; do
    expand_selector "$selector"
  done

  echo "Validate files $* for line duplicates"
  printf '%s\n' "${files[*]}"
  echo "[i]proceed with ${#files[@]} files"
  printf 'file\tduplicates\n'

  local file duplicates_count
  local duplicates_grouped_count=0
  local files_with_duplicates_count=0
  local unreadable_count=0

  for file in "${files[@]}"; do
    if [[ ! -f "$file" || ! -r "$file" ]]; then
      printf '[e]cannot read file: %s\n' "$file" >&2
      unreadable_count=$((unreadable_count + 1))
      continue
    fi

    duplicates_count=$(count_duplicate_groups "$file")
    printf '%s\t%s\n' "${file##*/}" "$duplicates_count"

    duplicates_grouped_count=$((duplicates_grouped_count + duplicates_count))
    if (( duplicates_count > 0 )); then
      files_with_duplicates_count=$((files_with_duplicates_count + 1))
    fi
  done

  if (( duplicates_grouped_count > 0 )); then
    echo "[e]Error $duplicates_grouped_count lines with duplicates in $files_with_duplicates_count files!"
    return 1
  fi

  echo "[i]0 duplicates"
  # A file that could not be checked must not be reported as a clean pass.
  if (( unreadable_count > 0 )); then
    return 2
  fi
  return 0
}

# Propagate a failure status from main; on success the script ends normally.
main "$@" || exit $?
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment