# GOAL: Create a small test dataset for CI/CD of cancer bioinformatics tools/pipelines
# Prerequisites: A clean Linux environment with a working internet connection
# Download and install mambaforge into a folder under your home directory:
curl -L https://github.com/conda-forge/miniforge/releases/download/4.14.0-0/Mambaforge-Linux-x86_64.sh -o /tmp/mambaforge.sh
sh /tmp/mambaforge.sh -bfp $HOME/mambaforge && rm -f /tmp/mambaforge.sh
# Add the following to your ~/.bashrc file to activate the base environment whenever you log in:
if [ -f "$HOME/mambaforge/etc/profile.d/conda.sh" ]; then
    . "$HOME/mambaforge/etc/profile.d/conda.sh"
    conda activate
fi
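# Optional sanity check (after restarting the shell): the base environment should be active
conda info | grep "active environment"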
# Log out and log back in to activate the base environment, then install the required packages:
mamba install -y -c bioconda htslib==1.13 samtools==1.13 picard-slim==2.25.7 nextflow==21.04.0 singularity==3.7.1
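# Optional: confirm the pinned tool versions before proceeding
samtools --version | head -1
nextflow -version
singularity --version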
# bs (BaseSpace CLI) cannot be installed using mamba, so let's do it manually:
curl -L https://launch.basespace.illumina.com/CLI/latest/amd64-linux/bs -o $HOME/mambaforge/bin/bs
chmod +x $HOME/mambaforge/bin/bs
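# Optional: confirm the bs binary runs (assumes it supports a --version flag)
bs --version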
# Visit https://basespace.illumina.com and log in, creating a free account if needed. Then visit https://basespace.illumina.com/s/RJSCrml92sQt
# and click "Accept" to add the free demo WGS data to your BaseSpace account. Run "bs auth" to log in on the command-line interface (CLI).
# Use the BaseSpace CLI to download the GRCh38 WGS BAMs for the tumor-normal pair HCC1187C (tumor) and HCC1187BL (normal):
mkdir data
bs list dataset --terse --is-type "illumina.isaac.v8.1" --project-name "HiSeq 2000: Tumor Normal WGS (HCC1187 & HCC2218)" |\
xargs -L1 bs contents dataset --terse --extension bam,bai --id |\
xargs -L1 bs download file --no-metadata -o data --id
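# Optional sanity check: both BAMs and their indexes should now be under data/
ls -lh data/HCC1187*_Proband_S1.bam*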
# Make smaller BAMs limited to 2Mbp around RUNX1 and sorted by read name:
samtools view -@8 -h data/HCC1187C_Proband_S1.bam chr21:34000000-36000000 | samtools sort -@8 -n -o data/chr21_tum_wgs.bam
samtools view -@8 -h data/HCC1187BL_Proband_S1.bam chr21:34000000-36000000 | samtools sort -@8 -n -o data/chr21_nrm_wgs.bam
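# Optional: confirm each subset BAM is non-empty (exact read counts are not pinned here; just expect >0)
samtools view -c data/chr21_tum_wgs.bam
samtools view -c data/chr21_nrm_wgs.bam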
# Unpack paired-end reads into separate FASTQs per read-group:
mkdir tum nrm
picard SamToFastq --VALIDATION_STRINGENCY SILENT --MAX_RECORDS_IN_RAM 8000000 --INPUT data/chr21_tum_wgs.bam --RG_TAG PU --OUTPUT_PER_RG true --COMPRESS_OUTPUTS_PER_RG true --OUTPUT_DIR tum
picard SamToFastq --VALIDATION_STRINGENCY SILENT --MAX_RECORDS_IN_RAM 8000000 --INPUT data/chr21_nrm_wgs.bam --RG_TAG PU --OUTPUT_PER_RG true --COMPRESS_OUTPUTS_PER_RG true --OUTPUT_DIR nrm
# Rename FASTQs to follow Illumina/Casava naming scheme:
# [SAMPLE NAME]_[BARCODE SEQUENCE]_L[LANE NUMBER (0-padded to 3 digits)]_R[READ NUMBER (either 1 or 2)]_[SET NUMBER (0-padded to 3 digits)].fastq.gz
ls tum/*.fastq.gz | perl -ne 'chomp; ($f,$l,$r)=m{(\w+)\.(\d)_(\d)\.fastq}; print "mv $_ tum/tum_${f}_L00${l}_R${r}_001.fastq.gz\n"' | bash
ls nrm/*.fastq.gz | perl -ne 'chomp; ($f,$l,$r)=m{(\w+)\.(\d)_(\d)\.fastq}; print "mv $_ nrm/nrm_${f}_L00${l}_R${r}_001.fastq.gz\n"' | bash
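# Optional: confirm the renamed FASTQs follow the expected naming scheme
ls tum/tum_*_L00?_R?_001.fastq.gz nrm/nrm_*_L00?_R?_001.fastq.gz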
# The tumor has 8 lanes of reads and the normal has 4 lanes. Downsample them to 2 lanes for tumor and 1 lane for normal:
rm nrm/nrm_C0JD1ACXX_L00{2,3,4}* tum/tum_C0G5BACXX_L00{3,4,5,6,7,8}*
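# Optional: tum/ should now hold 4 files (2 lanes x R1/R2) and nrm/ should hold 2 (1 lane x R1/R2)
ls tum | wc -l
ls nrm | wc -l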
# Create a tab-delimited manifest of these FASTQs compatible with https://nf-co.re/sarek:
echo -e "HCC1187\tXX\t0\tnrm\tC0JD1ACXX.1\tnrm/nrm_C0JD1ACXX_L001_R1_001.fastq.gz\tnrm/nrm_C0JD1ACXX_L001_R2_001.fastq.gz" > tum_nrm_wgs.tsv
echo -e "HCC1187\tXX\t1\ttum\tC0G5BACXX.1\ttum/tum_C0G5BACXX_L001_R1_001.fastq.gz\ttum/tum_C0G5BACXX_L001_R2_001.fastq.gz" >> tum_nrm_wgs.tsv
echo -e "HCC1187\tXX\t1\ttum\tC0G5BACXX.2\ttum/tum_C0G5BACXX_L002_R1_001.fastq.gz\ttum/tum_C0G5BACXX_L002_R2_001.fastq.gz" >> tum_nrm_wgs.tsv
# Process this data with the nf-core/sarek Nextflow pipeline, which also downloads a GRCh38 resource bundle for us:
nextflow run -profile singularity -w data/nf -r 2.7.1 nf-core/sarek --genome GRCh38 --no_intervals --skip_qc all --tools FreeBayes --input tum_nrm_wgs.tsv --outdir data/sarek
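# If the run is interrupted, re-running the same command with -resume appended reuses cached work, e.g.:
# nextflow run -profile singularity -w data/nf -r 2.7.1 nf-core/sarek --genome GRCh38 --no_intervals --skip_qc all --tools FreeBayes --input tum_nrm_wgs.tsv --outdir data/sarek -resume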
# Find the GRCh38 reference, subset to chr21 and an unplaced contig with many mapped reads, and index:
mkdir ref
samtools faidx -o ref/GRCh38_chr21.fa $(find data/nf/stage -type f -name Homo_sapiens_assembly38.fasta) chr21 chrUn_GL000220v1
samtools faidx ref/GRCh38_chr21.fa
picard CreateSequenceDictionary --REFERENCE ref/GRCh38_chr21.fa
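# Optional: the subset reference should contain exactly the two contigs selected above
cut -f1,2 ref/GRCh38_chr21.fa.fai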
# Find the SNP/Indel reference VCFs, subset to chr21 only (the \b avoids also matching alt contigs like chr21_*), and tabix index them:
bgzip -dc $(find data/nf/stage -type f -name Homo_sapiens_assembly38.dbsnp138.vcf 2>/dev/null || find data/nf/stage -type f -name dbsnp_146.hg38.vcf.gz) | grep -E "^#|^chr21\b" | bgzip -c > ref/dbsnp_146_GRCh38_chr21.vcf.gz
tabix -p vcf ref/dbsnp_146_GRCh38_chr21.vcf.gz
bgzip -dc $(find data/nf/stage -type f -name Homo_sapiens_assembly38.known_indels.vcf.gz) | grep -E "^#|^chr21\b" | bgzip -c > ref/known_indels_GRCh38_chr21.vcf.gz
tabix -p vcf ref/known_indels_GRCh38_chr21.vcf.gz
bgzip -dc $(find data/nf/stage -type f -name Mills_and_1000G_gold_standard.indels.hg38.vcf.gz) | grep -E "^#|^chr21\b" | bgzip -c > ref/gold_standard_indels_GRCh38_chr21.vcf.gz
tabix -p vcf ref/gold_standard_indels_GRCh38_chr21.vcf.gz
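# Optional: confirm each subset VCF is non-empty (exact record counts are not pinned here)
for v in ref/*_GRCh38_chr21.vcf.gz; do echo -n "$v: "; bgzip -dc "$v" | grep -vc "^#"; done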
# Move the Sarek BAMs/VCFs and their indexes into new subfolders:
mkdir bam vcf
find data/sarek -type f -name "*.recal.bam" -exec mv {} bam/ \;
find data/sarek -type f -name "*.recal.bam.bai" -exec mv {} bam/ \;
find data/sarek -type f -name "*.vcf.gz" -exec mv {} vcf/ \;
find data/sarek -type f -name "*.vcf.gz.tbi" -exec mv {} vcf/ \;
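# Optional: review what was collected before packaging
ls -lh bam vcf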
# Combine test dataset into a tar and upload for sharing:
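# The tar below expects a readme.txt; create a brief one first (this one-liner is just a placeholder):
echo "Small HCC1187 tumor-normal WGS test dataset (GRCh38 chr21:34-36Mbp), built per this script" > readme.txt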
tar -cf test_tum_nrm_wgs.tar bam nrm ref tum vcf tum_nrm_wgs.tsv readme.txt
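# Optional: record a checksum so users can verify the integrity of their downloads
md5sum test_tum_nrm_wgs.tar > test_tum_nrm_wgs.tar.md5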
scp test_tum_nrm_wgs.tar data.cyri.ac:/var/www/cyri.ac/html/
# Separately upload one set of tumor FASTQs for easy access:
scp tum/tum_C0G5BACXX_L001_R1_001.fastq.gz data.cyri.ac:/var/www/cyri.ac/html/test_L001_R1_001.fastq.gz
scp tum/tum_C0G5BACXX_L001_R2_001.fastq.gz data.cyri.ac:/var/www/cyri.ac/html/test_L001_R2_001.fastq.gz
# These test data should now be easily downloadable from https://data.cyri.ac
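# Optional: confirm the upload is reachable (assumes /var/www/cyri.ac/html/ is served as the site root)
curl -sI https://data.cyri.ac/test_tum_nrm_wgs.tar | head -1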