Created August 4, 2019 15:01
find_duplicates - Shell script to find duplicate files based on their SHA-1.
#!/usr/bin/env bash
#
# find_duplicates.sh
#
# Walks the non-hidden subdirectories of the current user's home
# directory and records in a SQLite database the path of every regular
# file found, along with its SHA-1 signature. Once the walk finishes,
# it generates a CSV report with the signature and path of the files
# that occur more than once.
#
# Usage:
#
#   $ cp find_duplicates.sh /tmp
#
#   $ bash /tmp/find_duplicates.sh
#   or
#   $ chmod +x /tmp/find_duplicates.sh
#   $ /tmp/find_duplicates.sh
#
# Dependencies:
#
#   - bash
#   - find, grep, awk
#   - sqlite3
#   - shasum
#
#
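# With ".mode csv" the report holds one "sha1,filepath" row per duplicate,
# grouped by signature. A hypothetical excerpt (made-up hash and paths):
#
#   0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33,/Users/me/Documents/notes.txt
#   0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33,/Users/me/Downloads/notes copy.txt
#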
export WORKDIR=/tmp/finddups
echo "==> Looking for duplicate files under $HOME..."
# Initialize the working directory. Create the database and helper scripts.
echo "---> Initializing working directory ($WORKDIR)..."
mkdir -p $WORKDIR
rm -f $WORKDIR/db
sqlite3 $WORKDIR/db <<EOF
create table sha_file (sha1 varchar(40) not null, filepath varchar(2000) not null);
create unique index ix_sf_fp on sha_file (filepath);
create index ix_sf_sha on sha_file (sha1);
create table sha_count (sha1 varchar(40) not null, count integer);
create unique index ix_sc_sha on sha_count (sha1);
EOF
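# The database can be inspected by hand at any point; for instance
# (hypothetical session):
#
#   $ sqlite3 /tmp/finddups/db 'select count(*) from sha_file;'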
rm -f $WORKDIR/sqlize.awk
cat <<EOF >$WORKDIR/sqlize.awk
{
  sha1=\$1;
  filepath=substr(\$0, 43);
  gsub(/\\r/, "", filepath);
  # Double up single quotes so paths like "it's.txt" don't break the SQL.
  gsub(/'/, "''", filepath);
  printf "insert into sha_file values('%s', '%s');\\n", sha1, filepath;
}
{
  # activity indicator...
  printf "." > "/dev/stderr"
}
EOF
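# A hypothetical shasum output line such as
#
#   0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33  /Users/me/notes.txt
#
# (40 hex chars, two separator chars, then the path starting at column 43,
# which is why substr uses offset 43) is turned into:
#
#   insert into sha_file values('0beec7b5ea3f0fdbc95d0dd47f3c5bc275da8a33', '/Users/me/notes.txt');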
# Generate the list of subdirectories to walk, then perform the walk,
# feeding the database of digital signatures.
echo "---> Generating the list of subdirectories to walk..."
find $HOME -mindepth 1 -maxdepth 1 -type d -exec basename {} \; \
    | grep -v '^[.]' \
    >$WORKDIR/dirlist
echo "---> Walking..."
SAVEIFS=$IFS
# Limit word splitting to newlines so directory names containing spaces
# survive the for loop below.
IFS=$(echo -en "\n\b")
for d in $(cat $WORKDIR/dirlist)
do
    echo -n "[$d]"
    find "$HOME/$d" \
        -type f \
        -user $USER \
        -not -name .DS_Store \
        -not -size 0 \
        -exec shasum {} \; \
        | awk -f $WORKDIR/sqlize.awk \
        | sqlite3 $WORKDIR/db
done
IFS=$SAVEIFS
echo
echo "---> Registrando conteo general e identificando duplicados" | |
sqlite3 $WORKDIR/db <<EOF | |
insert into sha_count select sha1, count(sha1) from sha_file group by sha1; | |
delete from sha_count where count=1; | |
EOF | |
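# For example, if three copies of one file were hashed, sha_count now holds
# a single (sha1, 3) row for that signature; rows with count=1 are dropped,
# so only duplicated signatures remain.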
# Generate the report (.mode csv so the output is real CSV, as the
# filename promises).
DUPS_REPORT=$WORKDIR/dups.report.csv
echo "==> Generating report in $DUPS_REPORT"
echo "---> (i) Groups of files sharing the same key are identical."
sqlite3 $WORKDIR/db <<EOF >$DUPS_REPORT
.mode csv
select sf.sha1, sf.filepath
from sha_file sf, sha_count sc
where sf.sha1=sc.sha1
order by sf.sha1, sf.filepath;
EOF
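# One possible way to skim the result (hypothetical one-liner): list the
# signatures with the most copies first.
#
#   $ awk -F, '{print $1}' /tmp/finddups/dups.report.csv | uniq -c | sort -rn | head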
echo