# nathanhaigh/sra_process.sh

## sra_process.sh
#!/bin/bash
# Bash script to download a bunch of *.sra files from the NCBI SRA, using
# the aspera client, and extract FASTQ data using the SRA Toolkit.
#
# Run it from the directory that should receive the files. For every entry in
# `files` the script:
#   1. downloads <accession>.sra into the current directory with ascp;
#   2. converts it to <accession>.fastq.gz with fastq-dump piped into pigz.
# An entry is skipped when <accession>.fastq.gz already exists or when a
# partial-transfer marker (*.aspx) is present.
#
# Optional environment overrides (defaults are the original hard-coded paths):
#   ASCP_BIN        path to the ascp executable
#   ASCP_KEY        path to the aspera key file passed to `ascp -i`
#   FASTQ_DUMP_BIN  path to the fastq-dump executable
#
# Exit status: 0 when every file was converted or skipped, 1 otherwise.

# A pipeline must fail when any stage fails; otherwise a fastq-dump error
# would be hidden by pigz exiting successfully.
set -o pipefail

max_bandwidth_mbps=50

ascp_bin=${ASCP_BIN:-${HOME}/.aspera/connect/bin/ascp}
ascp_key=${ASCP_KEY:-${HOME}/.aspera/connect/etc/asperaweb_id_dsa.putty}
fastq_dump_bin=${FASTQ_DUMP_BIN:-/home/nhaigh/bioinf/sratoolkit.2.2.2a-ubuntu32/bin/fastq-dump}
remote_root='anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra'

# These SRA files are for the durum genome
files=(
  'SRR567512.sra'
  'SRR567559.sra'
  'SRR567563.sra'
  'SRR570310.sra'
  'SRR567544.sra'
  'SRR567549.sra'
  'SRR567552.sra'
)

failures=0

for file in "${files[@]}"; do
  echo "${file}"
  run=${file%.sra}
  fastq_gz="${run}.fastq.gz"

  # Remote layout: <root>/<first 3 chars>/<first 6 chars>/<accession>/<file>.
  # -l caps the transfer rate at ${max_bandwidth_mbps} Mbps.
  download_ok=1
  "${ascp_bin}" -i "${ascp_key}" -k1 -QTr -l"${max_bandwidth_mbps}m" \
    "${remote_root}/${file:0:3}/${file:0:6}/${run}/${file}" ./ \
    || download_ok=0

  if [[ -e "${fastq_gz}" ]]; then
    # Already converted on a previous run.
    echo "  Skipping"
  elif [[ -e "${file}.aspx" || -e "${run}.aspx" ]]; then
    # Only process sra files that have completed downloading (i.e. no *.aspx
    # file). NOTE(review): ascp's resume marker is believed to be named
    # <file>.aspx; the accession-based name the script originally tested is
    # still checked as well -- confirm against the installed ascp.
    echo "  Skipping"
  elif (( ! download_ok )) || [[ ! -f "${file}" ]]; then
    # Never hand a missing/failed download to fastq-dump: that would leave an
    # empty ${fastq_gz} behind and later runs would wrongly skip this file.
    echo "  Download failed; not converting" >&2
    failures=$((failures + 1))
  else
    echo -n "  Extracting data into FASTQ format ... "
    # Convert SRA to FASTQ and change the formatting of the output to reduce
    # disk space and be consistent with normal Illumina read naming, i.e. /1
    # and /2 suffixes. Output goes to a temporary name and is renamed only on
    # success, so an interrupted or failed conversion is retried next time.
    part="${fastq_gz}.part"
    # shellcheck disable=SC2016  # $ac/$si/$ri are fastq-dump placeholders
    if "${fastq_dump_bin}" --split-spot --stdout --readids \
        --defline-seq '@$ac.$si/$ri' --defline-qual '+' "${file}" \
        | pigz --best --processes 10 > "${part}" \
        && mv -- "${part}" "${fastq_gz}"; then
      echo "DONE"
    else
      echo "FAILED"
      rm -f -- "${part}"
      failures=$((failures + 1))
    fi
  fi
done

if (( failures > 0 )); then
  echo "${failures} file(s) could not be processed" >&2
  exit 1
fi
	#!/bin/bash
	# Bash script to download a bunch of *.sra files from the NCBI SRA, using
	# the aspera client, and extract FASTQ data using the SRA Toolkit.
	#
	# Run it from the directory that should receive the files. For every entry in
	# `files` the script:
	#   1. downloads <accession>.sra into the current directory with ascp;
	#   2. converts it to <accession>.fastq.gz with fastq-dump piped into pigz.
	# An entry is skipped when <accession>.fastq.gz already exists or when a
	# partial-transfer marker (*.aspx) is present.
	#
	# Optional environment overrides (defaults are the original hard-coded paths):
	#   ASCP_BIN        path to the ascp executable
	#   ASCP_KEY        path to the aspera key file passed to `ascp -i`
	#   FASTQ_DUMP_BIN  path to the fastq-dump executable
	#
	# Exit status: 0 when every file was converted or skipped, 1 otherwise.

	# A pipeline must fail when any stage fails; otherwise a fastq-dump error
	# would be hidden by pigz exiting successfully.
	set -o pipefail

	max_bandwidth_mbps=50

	ascp_bin=${ASCP_BIN:-${HOME}/.aspera/connect/bin/ascp}
	ascp_key=${ASCP_KEY:-${HOME}/.aspera/connect/etc/asperaweb_id_dsa.putty}
	fastq_dump_bin=${FASTQ_DUMP_BIN:-/home/nhaigh/bioinf/sratoolkit.2.2.2a-ubuntu32/bin/fastq-dump}
	remote_root='anonftp@ftp-trace.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra'

	# These SRA files are for the durum genome
	files=(
	'SRR567512.sra'
	'SRR567559.sra'
	'SRR567563.sra'
	'SRR570310.sra'
	'SRR567544.sra'
	'SRR567549.sra'
	'SRR567552.sra'
	)

	failures=0

	for file in "${files[@]}"; do
	echo "${file}"
	run=${file%.sra}
	fastq_gz="${run}.fastq.gz"

	# Remote layout: <root>/<first 3 chars>/<first 6 chars>/<accession>/<file>.
	# -l caps the transfer rate at ${max_bandwidth_mbps} Mbps.
	download_ok=1
	"${ascp_bin}" -i "${ascp_key}" -k1 -QTr -l"${max_bandwidth_mbps}m" \
		"${remote_root}/${file:0:3}/${file:0:6}/${run}/${file}" ./ \
		|| download_ok=0

	if [[ -e "${fastq_gz}" ]]; then
	# Already converted on a previous run.
	echo " Skipping"
	elif [[ -e "${file}.aspx" || -e "${run}.aspx" ]]; then
	# Only process sra files that have completed downloading (i.e. no *.aspx
	# file). NOTE(review): ascp's resume marker is believed to be named
	# <file>.aspx; the accession-based name the script originally tested is
	# still checked as well -- confirm against the installed ascp.
	echo " Skipping"
	elif (( ! download_ok )) || [[ ! -f "${file}" ]]; then
	# Never hand a missing/failed download to fastq-dump: that would leave an
	# empty ${fastq_gz} behind and later runs would wrongly skip this file.
	echo " Download failed; not converting" >&2
	failures=$((failures + 1))
	else
	echo -n " Extracting data into FASTQ format ... "
	# Convert SRA to FASTQ and change the formatting of the output to reduce
	# disk space and be consistent with normal Illumina read naming, i.e. /1
	# and /2 suffixes. The stages are joined by a real pipe (an escaped "\|"
	# would be passed to fastq-dump as a literal argument and nothing would be
	# compressed). Output goes to a temporary name and is renamed only on
	# success, so an interrupted or failed conversion is retried next time.
	part="${fastq_gz}.part"
	# shellcheck disable=SC2016  # $ac/$si/$ri are fastq-dump placeholders
	if "${fastq_dump_bin}" --split-spot --stdout --readids \
		--defline-seq '@$ac.$si/$ri' --defline-qual '+' "${file}" \
		| pigz --best --processes 10 > "${part}" \
		&& mv -- "${part}" "${fastq_gz}"; then
	echo "DONE"
	else
	echo "FAILED"
	rm -f -- "${part}"
	failures=$((failures + 1))
	fi
	fi
	done

	if (( failures > 0 )); then
	echo "${failures} file(s) could not be processed" >&2
	exit 1
	fi