Created
June 7, 2019 14:11
-
-
Save remyd1/623c99143aecb33eb8f8574cfc18fd7b to your computer and use it in GitHub Desktop.
Basic file to download ncbi DB using aria2 tool and pigz
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
####################################
# This script needs pigz and aria2 #
# to speedup download and extracts #
# otherwise, edit it to use wget   #
# and a basic tar                  #
####################################
# 20190307 Rémy Dernat.
# MBB Team ISE-M Montpellier, France.
# last update 20190312 : PATHs hardcoded b/c of the crontab env with sh
#
# Setup section: defines the tool paths, the download URL, the log file and
# the timestamped working directory, then makes that directory the current
# one. The optional first argument is the number of connections (see usage).

usage="$0 [nb of connections]
eg: $0 8"

# Tool locations are absolute on purpose (see the 20190312 note above).
#ARIA=`which aria2c`
ARIA="/usr/local/bin/aria2c"
ARIA_ARGS=""
#PIGZ=`which pigz`
PIGZ="/usr/bin/pigz"
BLASTWEBSITE="https://ftp.ncbi.nlm.nih.gov/blast/db/"
DATE=$(date +"%Y%m%d_%H%M%S")

LOGDIR="/var/log/ncbidb/"
# -p: succeed silently when the directory already exists.
if ! mkdir -p -- "$LOGDIR"; then
  echo "cannot create log directory $LOGDIR" >&2
  exit 1
fi
# LOGDIR ends with "/": strip it so the log path has no doubled slash.
LOGFILE="${LOGDIR%/}/${DATE}_ncbidb.log"
MAIL="me@mail.com"
BIOTMPDIR="/export/bio/ncbi/${DATE}_db"
BIODIR="/export/bio/ncbi/db"

echo "STARTING DOWNLOAD AT $DATE" > "$LOGFILE"

# The rest of the script works relative to the current directory and removes
# BIOTMPDIR when it is done, so running on after a failed mkdir/cd would
# leave the files in whatever directory the script was started from.
if ! mkdir -- "$BIOTMPDIR" || ! cd -- "$BIOTMPDIR"; then
  echo "cannot create or enter $BIOTMPDIR" | tee -a "$LOGFILE" >&2
  exit 1
fi
#######################################
# Tell whether a string is a usable connection count.
# Arguments: $1 - candidate value
# Returns:   0 when $1 is a positive decimal integer (digits only, no
#            leading zero), 1 otherwise
#######################################
is_connection_count() {
  # The regex must stay unquoted: a quoted right-hand side of =~ is compared
  # as a literal string, which would reject every number.
  [[ "$1" =~ ^[1-9][0-9]*$ ]]
}

# Optional first argument: number of simultaneous connections, passed to
# aria2c as both -j and -s.
if [ -n "$1" ]; then
  if is_connection_count "$1"; then
    echo "using $1 simultaneous connections"
    ARIA_ARGS="-j$1 -s$1"
  else
    # Quoted so the two-line usage text keeps its line break.
    echo "$usage" >&2
    exit 1
  fi
fi
# Download step: fetch the directory index of BLASTWEBSITE, extract the
# nr/nt archive names from it, write one URL per line to
# /tmp/urlblastlist.txt and hand that list to aria2c.
# Sets list_of_files (whitespace-separated names) for the following steps.

# NOTE(review): --no-check-certificate turns off TLS certificate validation;
# kept as in the original, confirm whether the host really needs it.
if ! wget --no-check-certificate -O /tmp/urlblastfulllist.txt "$BLASTWEBSITE"; then
  echo "cannot fetch the file listing from $BLASTWEBSITE" | tee -a "$LOGFILE" >&2
  exit 1
fi

# Lines are split on '=', '<' and '>'; for an anchor written as
# <a href="NAME">NAME</a> (assumed index format) field 4 is the link text.
# The pattern is not anchored at the end, so NAME.tar.gz.md5 entries are
# selected together with the archives.
list_of_files=$(awk -F"[=<>]" '/"n[rt]\.[0-9]{2,3}\.tar\.gz/ {print $4}' /tmp/urlblastfulllist.txt)
if [ -z "$list_of_files" ]; then
  echo "no nr/nt archive found in the listing of $BLASTWEBSITE" | tee -a "$LOGFILE" >&2
  exit 1
fi

# The list is rewritten from scratch on every run: appending would keep the
# entries left behind by a previous run that did not reach the cleanup.
# BLASTWEBSITE already ends with "/", hence the %/ strip.
for file in $list_of_files; do
  echo "${BLASTWEBSITE%/}/$file"
done > /tmp/urlblastlist.txt

# ARIA_ARGS is intentionally unquoted: it holds several options that must be
# split into separate words (and expands to nothing when empty).
# shellcheck disable=SC2086
if ! "$ARIA" $ARIA_ARGS -i /tmp/urlblastlist.txt; then
  # Not fatal here: the checksum step that follows decides what is usable.
  echo "aria2c reported download errors" >> "$LOGFILE"
fi
#######################################
# Compare the MD5 digest stored in a .md5 file with the digest of the
# archive it belongs to (same path without the ".md5" suffix).
# Only the first whitespace-separated field of the .md5 file is used, so the
# file name recorded next to the digest does not influence the result.
# Arguments: $1 - path of the .md5 file
# Returns:   0 digests are identical
#            1 .md5 file missing/empty, digest mismatch or md5sum failure
#            2 archive file is missing
#######################################
check_md5_file() {
  local md5_file=$1
  local archive=${md5_file%.md5}
  local expected="" actual=""

  [ -f "$archive" ] || return 2
  [ -f "$md5_file" ] || return 1
  # read's status is ignored on purpose: it is non-zero when the file has no
  # trailing newline although the digest was read correctly.
  read -r expected _ < "$md5_file"
  [ -n "$expected" ] || return 1
  actual=$(md5sum < "$archive") || return 1
  actual=${actual%% *}
  [ "$expected" = "$actual" ]
}

DATE=$(date +"%Y%m%d_%H%M%S")
echo "FINISHING DOWNLOAD AT $DATE" >> "$LOGFILE"
echo "###########################" >> "$LOGFILE"
echo "STARTING CHECKING FILES WITH CHECKSUMS AT $DATE" >> "$LOGFILE"

# corrupted_file becomes 1 as soon as one archive is missing or fails its
# checksum; the extract step reads it.
corrupted_file=0
# checking file integrity
for file in $list_of_files; do
  [[ "$file" == *.md5 ]] || continue
  file_without_md5=${file%.md5}
  check_md5_file "$file"
  case $? in
    0)
      echo "$file_without_md5 is Ok..." >> "$LOGFILE"
      ;;
    2)
      # A listed archive that never arrived must not pass as a success.
      echo "$file_without_md5 is missing" >> "$LOGFILE"
      corrupted_file=1
      ;;
    *)
      echo "$file_without_md5 badly downloaded" >> "$LOGFILE"
      corrupted_file=1
      ;;
  esac
done

DATE=$(date +"%Y%m%d_%H%M%S")
echo "FINISHING CHECKING FILES WITH CHECKSUMS AT $DATE" >> "$LOGFILE"
echo "###########################" >> "$LOGFILE"
echo "STARTING EXTRACT OF FILES AT $DATE" >> "$LOGFILE"
# Extract step: when the checksum step flagged a problem the log is mailed
# and nothing is extracted; otherwise every archive in list_of_files is
# unpacked into BIODIR. The working directory and the URL lists are removed
# only when every archive was extracted, so a failed run can be inspected.
extract_failed=0
if [[ "$corrupted_file" -eq 1 ]]; then
  mail -s "some ncbi db files are corrupted" "$MAIL" < "$LOGFILE"
else
  for file in $list_of_files; do
    # Checksum companions are not archives.
    [[ "$file" == *md5* ]] && continue
    "$PIGZ" -dc -- "$file" | tar -xf - -C "$BIODIR"
    # Must be read right after the pipeline: holds the status of pigz and tar.
    pipe_status=("${PIPESTATUS[@]}")
    if ((pipe_status[0] != 0 || pipe_status[1] != 0)); then
      echo "$file extraction failed" >> "$LOGFILE"
      extract_failed=1
    fi
  done

  if ((extract_failed)); then
    mail -s "some ncbi db files could not be extracted" "$MAIL" < "$LOGFILE"
  else
    # cleaning
    if [[ -n "$BIOTMPDIR" ]]; then
      rm -rf -- "$BIOTMPDIR"
    fi
    # Only the two files written by this script, not every /tmp/url*.txt.
    rm -f /tmp/urlblastfulllist.txt /tmp/urlblastlist.txt
  fi
fi

DATE=$(date +"%Y%m%d_%H%M%S")
echo "FINISHING EXTRACT OF FILES AT $DATE" >> "$LOGFILE"

# Report the failure to the caller (cron, wrapper script) as well.
if [[ "$corrupted_file" -eq 1 ]] || ((extract_failed)); then
  exit 1
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment