# tluquez/RNAseq_alldata_download.sh

## RNAseq_alldata_download.sh
#!/usr/bin/env bash

# MIT License
#
# Copyright (c) [2018] [Grupo de Bioquimica Experimental y Computacional from the Pontificia Universidad Javeriana and Tain Velasco-Luquez]
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Author:
#Tain Velasco-Luquez <tvelasco@javeriana.edu.co> @TainVelasco-Luquez: Design and implementation

# Last updated 12/07/2018

##### Usage
#######################################
# Print the help text of this script.
# Globals:   none
# Arguments: none (the script name is taken from $0)
# Outputs:   the help text on stdout
# Returns:   the exit status of cat
#######################################
usage() {
	# A here-document is used instead of printf so that the script name is
	# printed as plain data and is never interpreted as a printf format.
	cat <<EOF

  ${0##*/} -- Program to download the human reference genome (GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz), the annotation file for that human reference genome (GCA_000001405.15_GRCh38_full_analysis_set.refseq_annotation.gff.gz) and also downloads all .fastq files from ENA supplied by the user in a text file.

  Usage:
  ${0##*/} First_argument Second_argument Third_argument Fourth_argument [uncompress]

  Optional:
  --help -h  Show this help text
  uncompress  Fifth argument. If "true" then apply gunzip to all .gz

  Mandatory:
  First_argument  Path to save the data
  Second_argument  File with the aspera urls downloaded from ENA (its first line is taken as a header and removed)
  Third_argument  The max transfer rate over the network (e.g 300m or 600m)
  Fourth_argument  Path to the asperaweb_id_dsa.openssh (e.g. ~/Applications/Aspera\ Connect.app/Contents/Resources/asperaweb_id_dsa.openssh)

  Requirements:
  1. This file assumes aspera CLI is installed and its path properly exported

EOF
}

##### Show the help text and leave when the first argument asks for it
case "$1" in
--help | -h)
	usage
	exit 0
	;;
esac

##### Check for arguments
# The save path, the ENA url file, the max transfer rate and the aspera key
# are all used further down, so at least four positional arguments are
# required; the fifth one (uncompress) is optional. Stopping here avoids
# running the download steps with empty values.
if (( $# < 4 )); then
	if (( $# == 0 )); then
		printf "No arguments supplied." >&2
	else
		printf "Expected at least 4 arguments but got %d." "$#" >&2
	fi
	# The help text goes to stderr as well, because this is an error path
	usage >&2
	exit 1
fi

##### Positional arguments, in the order they are given on the command line
my_path="${1}"            # Directory where the downloaded data is saved
asp_url="${2}"            # Text file with the aspera urls exported from ENA
max_transfer_rate="${3}"  # Upper limit for the transfer speed (e.g 300m)
aspera_path="${4}"        # Location of asperaweb_id_dsa.openssh (e.g. ~/Applications/Aspera\ Connect.app/Contents/Resources/asperaweb_id_dsa.openssh)
uncompress="${5}"         # Compared against "true" by the uncompress step

# Lets start by downloading the reference genome according to the instructions in RNA_processing.md
# The annotation file is taken for the same version and from the same source
# as the reference genome, avoiding in this way incompatibilities.
ncbi_dir="rsync://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids"

#######################################
# Download one file of the reference with rsync.
# Globals:   ncbi_dir (read), my_path (read)
# Arguments: $1 - what the file is (used in the messages)
#            $2 - name of the file inside ncbi_dir
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 on success; exits the script with 1 when rsync fails
#######################################
download_reference() {
	local what=$1
	local file_name=$2

	printf "Downloading the reference %s %s \n" "${what}" "${file_name}"
	# Do not report DONE (nor continue) when the transfer did not succeed
	if ! rsync -avP "${ncbi_dir}/${file_name}" "${my_path}"; then
		printf "The reference %s %s could not be downloaded. \n" "${what}" "${file_name}" >&2
		exit 1
	fi
	printf "\n DONE \n"
}

download_reference "genome" "GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz"
download_reference "annotation" "GCA_000001405.15_GRCh38_full_analysis_set.refseq_annotation.gff.gz"

# The file with the aspera urls starts with a header (first line), which is
# removed from the file in place. tail and a temporary file are used instead
# of vim, so neither an editor nor a terminal is needed.
# NOTE: every run removes the first line again, so always start from a file
# that still has its header.
if [[ ! -f "${asp_url}" ]]; then
	printf "The file with the aspera urls (%s) was not found. \n" "${asp_url}" >&2
	exit 1
fi

urls_without_header=$(mktemp "${TMPDIR:-/tmp}/aspera_urls.XXXXXX") || exit 1
# cp (and not mv) writes the result back, so that the original file keeps its
# own permissions instead of the ones of the temporary file
if tail -n +2 "${asp_url}" > "${urls_without_header}" \
	&& cp -- "${urls_without_header}" "${asp_url}"; then
	rm -f -- "${urls_without_header}"
else
	rm -f -- "${urls_without_header}"
	printf "The header could not be removed from %s \n" "${asp_url}" >&2
	exit 1
fi

printf "The number of files to be downloaded from ENA is:"
wc -l "${asp_url}"  # How many fastq files are in the study?

# Lets download the RNAseq data associated to the study accession number SRP103788
# ascp has to be reachable through PATH. Looking for the command itself also
# accepts installations whose directory is not named CLI/bin.
if ! command -v ascp > /dev/null 2>&1; then
	printf "Make sure you export the aspera CLI path. \n" >&2
	exit 1
fi

printf "Downloading the files in %s \n" "${asp_url}"
failed_downloads=0
# The urls are read through file descriptor 3, so that a command inside the
# loop that reads stdin can never consume the remaining lines. The second
# condition keeps a last line that does not end with a newline.
while read -r -u 3 every_line || [[ -n "${every_line}" ]]; do
	# Nothing to download for an empty line
	[[ -n "${every_line}" ]] || continue
	if ascp -QT -l "${max_transfer_rate}" -P33001 -i "${aspera_path}" "era-fasp@${every_line}" "${my_path}"; then
		printf "%s is downloaded \n" "${every_line}"
	else
		printf "%s could not be downloaded \n" "${every_line}" >&2
		failed_downloads=$((failed_downloads + 1))
	fi
done 3< "${asp_url}"

# Do not claim success (nor uncompress anything) after a failed transfer
if (( failed_downloads > 0 )); then
	printf "%d files could not be downloaded \n" "${failed_downloads}" >&2
	exit 1
fi
printf "All files have been downloaded \n"

# Finally, lets unzip the files. This step is optional: it only runs when the
# fifth argument is "true", and skipping it is not an error.
if [[ "${uncompress}" == "true" ]]; then
	# Never run gunzip in the wrong directory
	if ! cd -- "${my_path}"; then
		printf "Could not change to %s \n" "${my_path}" >&2
		exit 1
	fi
	printf "About to uncompress the files...\n"
	# The ./ prefix protects file names that start with a dash
	if ! gunzip ./*.gz; then
		printf "Some files could not be uncompressed\n" >&2
		exit 1
	fi
	printf "All files have been uncompressed\n"
fi

# The script always ends here
exit 0
##### All done
	# NOTE: a second, tab-indented copy of the whole script used to follow here.
	# It could never run, because every path of the script above ends in an
	# exit, and it was damaged (an escaped "\|\|" in the help check and a
	# PATH test without its "*" wildcards), so it has been removed.