Skip to content

Instantly share code, notes, and snippets.

@remyd1
Created June 7, 2019 14:11
Show Gist options
  • Save remyd1/623c99143aecb33eb8f8574cfc18fd7b to your computer and use it in GitHub Desktop.
Save remyd1/623c99143aecb33eb8f8574cfc18fd7b to your computer and use it in GitHub Desktop.
Basic script to download the NCBI BLAST databases using the aria2 and pigz tools
#!/bin/bash
######################################################################
# Download the NCBI BLAST "nr"/"nt" database archives, check them
# against their MD5 files and unpack them.
#
# This script needs aria2 and pigz to speed up the download and the
# extraction; otherwise, edit it to use wget and a basic tar.
######################################################################
# 20190307 Rémy Dernat.
# MBB Team ISE-M Montpellier, France.
# last update 20190312 : PATHs hardcoded b/c of the crontab env with sh

usage="$0 [nb of connections]
eg: $0 8"

# Tool locations are hardcoded on purpose (see the crontab note above).
#ARIA=$(which aria2c)
ARIA="/usr/local/bin/aria2c"
# Extra aria2c options; filled in when a connection count is given.
ARIA_ARGS=""
#PIGZ=$(which pigz)
PIGZ="/usr/bin/pigz"

BLASTWEBSITE="https://ftp.ncbi.nlm.nih.gov/blast/db/"
DATE=$(date +"%Y%m%d_%H%M%S")

# One timestamped log file per run.
LOGDIR="/var/log/ncbidb/"
if [ ! -d "$LOGDIR" ]; then
  mkdir -p "$LOGDIR"
fi
LOGFILE=$LOGDIR"/"$DATE"_ncbidb.log"
MAIL="me@mail.com"

# Archives are downloaded into a per-run directory and unpacked into BIODIR.
BIOTMPDIR="/export/bio/ncbi/"$DATE"_db"
BIODIR="/export/bio/ncbi/db"

# Everything below works relative to the download directory, so refuse to
# continue if it cannot be created or entered (otherwise the archives would
# land in whatever directory the script was started from).
if ! mkdir -p "$BIOTMPDIR" || ! cd "$BIOTMPDIR"; then
  echo "cannot create or enter $BIOTMPDIR" >&2
  exit 1
fi
echo "STARTING DOWNLOAD AT $DATE" > "$LOGFILE"
# Optional first argument: number of simultaneous connections for aria2c.

# Return 0 when $1 is a positive decimal integer (digits only, no leading
# zero), non-zero otherwise.
# The regex must stay UNQUOTED: a quoted right-hand side of =~ is matched as
# a literal string, which would reject every real number.
_is_connection_count() {
  [[ "$1" =~ ^[1-9][0-9]*$ ]]
}

if [ -n "$1" ]; then
  if _is_connection_count "$1"; then
    echo "using $1 simultaneous connections"
    # NOTE(review): aria2c also limits connections per server with -x
    # (--max-connection-per-server); confirm -j/-s alone give the intended
    # parallelism.
    ARIA_ARGS="-j$1 -s$1"
  else
    printf '%s\n' "$usage" >&2
    exit 1
  fi
fi
# Fetch the HTML index of the BLAST db directory; without it there is nothing
# to download, so stop here if the request fails.
# NOTE(review): --no-check-certificate disables TLS certificate validation;
# confirm it is still required on this host.
if ! wget --no-check-certificate -O /tmp/urlblastfulllist.txt "$BLASTWEBSITE"; then
  echo "cannot fetch the file index from $BLASTWEBSITE" >> "$LOGFILE"
  exit 1
fi

# Keep the link text of every nr/nt volume ("nr.00.tar.gz", "nt.123.tar.gz",
# ...) and of the matching ".md5" files. Splitting on '=', '<' and '>' puts
# the link text of an <a href="..."> element in field 4.
list_of_files=$(awk -F'[=<>]' '/"n[rt]\.[0-9]{2,3}\.tar\.gz/ {print $4}' /tmp/urlblastfulllist.txt)

# Build the aria2c input file from scratch on every run (redirecting the
# whole loop truncates any list left behind by an earlier run).
# $list_of_files is deliberately unquoted: it is a whitespace-separated list.
for file in $list_of_files; do
  echo "${BLASTWEBSITE%/}/$file"
done > /tmp/urlblastlist.txt

# Archives and their .md5 files are all downloaded by aria2c.
# $ARIA_ARGS is deliberately unquoted so "-jN -sN" splits into two options.
# shellcheck disable=SC2086
"$ARIA" $ARIA_ARGS -i /tmp/urlblastlist.txt
DATE=$(date +"%Y%m%d_%H%M%S")
echo "FINISHING DOWNLOAD AT $DATE" >> "$LOGFILE"
echo "###########################" >> "$LOGFILE"
echo "STARTING CHECKING FILES WITH CHECKSUMS AT $DATE" >> "$LOGFILE"

# Check one downloaded archive against its checksum file.
# Arguments: $1 - name of the ".md5" file (the archive is the same name
#                 without the ".md5" suffix), relative to the current dir.
# Returns:   0 when the archive exists and its MD5 digest equals the first
#            field of the checksum file; 1 otherwise (missing archive,
#            missing/empty checksum file, or digest mismatch).
archive_matches_md5() {
  local md5_file=$1
  local archive=${md5_file%.md5}
  local expected actual

  [ -f "$archive" ] || return 1
  [ -f "$md5_file" ] || return 1

  # Compare digests only, so the file name printed after the digest cannot
  # cause a false mismatch.
  read -r expected _ < "$md5_file"
  actual=$(md5sum "$archive") || return 1
  actual=${actual%% *}

  [ -n "$expected" ] && [ "$expected" = "$actual" ]
}

# Set to 1 as soon as one archive is missing or fails its checksum.
corrupted_file=0
# $list_of_files is deliberately unquoted: it is a whitespace-separated list.
for file in $list_of_files; do
  # Only the ".md5" entries drive the check.
  case "$file" in
    *.md5) ;;
    *) continue ;;
  esac
  archive_name=${file%.md5}
  if archive_matches_md5 "$file"; then
    echo "$archive_name is Ok..." >> "$LOGFILE"
  else
    echo "$archive_name badly downloaded" >> "$LOGFILE"
    corrupted_file=1
  fi
done

DATE=$(date +"%Y%m%d_%H%M%S")
echo "FINISHING CHECKING FILES WITH CHECKSUMS AT $DATE" >> "$LOGFILE"
echo "###########################" >> "$LOGFILE"
echo "STARTING EXTRACT OF FILES AT $DATE" >> "$LOGFILE"
if [[ "$corrupted_file" -eq 1 ]]; then
  # At least one archive failed its checksum: extract nothing, keep the
  # downloads for inspection and send the log by mail.
  mail -s "some ncbi db files are corrupted" "$MAIL" < "$LOGFILE"
else
  mkdir -p "$BIODIR"
  extract_failed=0
  # $list_of_files is deliberately unquoted: it is a whitespace-separated list.
  for file in $list_of_files; do
    # Checksum files are not archives.
    if [[ "$file" =~ md5 ]]; then
      continue
    fi
    "$PIGZ" -dc "$file" | tar -xf - -C "$BIODIR"
    # PIPESTATUS is overwritten by the next command, so copy it right away;
    # both the decompression and the untar must have succeeded.
    pipe_status=("${PIPESTATUS[@]}")
    if [ "${pipe_status[0]}" -ne 0 ] || [ "${pipe_status[1]}" -ne 0 ]; then
      echo "$file could not be extracted" >> "$LOGFILE"
      extract_failed=1
    fi
  done

  if [ "$extract_failed" -eq 0 ]; then
    # cleaning: only once every archive has been unpacked. ${VAR:?} aborts
    # instead of running rm -rf with an empty path.
    rm -rf -- "${BIOTMPDIR:?}"
    # Remove only the two list files used for the download, not every
    # /tmp/url*.txt on the machine.
    rm -f /tmp/urlblastfulllist.txt /tmp/urlblastlist.txt
  else
    # Keep the downloaded archives so the extraction can be retried.
    mail -s "some ncbi db files could not be extracted" "$MAIL" < "$LOGFILE"
  fi
fi
DATE=$(date +"%Y%m%d_%H%M%S")
echo "FINISHING EXTRACT OF FILES AT $DATE" >> "$LOGFILE"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment