edawson/v3.7_bam2fq_fq2bam

## v3.7_bam2fq_fq2bam

#############
## Download the 30X hg19-aligned bam from Google's public sequencing of HG002
## and the respective BAI file.
#############

## The BAI index lives next to the BAM under the same URL plus a ".bai" suffix,
## so the URL is spelled out once and reused for both downloads.
hg002_bam_url=https://storage.googleapis.com/brain-genomics-public/research/sequencing/grch37/bam/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.dedup.grch37.bam
wget "${hg002_bam_url}"
wget "${hg002_bam_url}.bai"


#############
## Prepare the references so we can realign reads
#############

## Download the original hg19 / hs37d5 reference
## and create an FAI index
hg19_fasta=hs37d5.fa
wget "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/${hg19_fasta}.gz"
## Decompress in place (leaves hs37d5.fa), then write the .fai index beside it.
gunzip "${hg19_fasta}.gz"
samtools faidx "${hg19_fasta}"

## Fetch the GRCh38 no-alt analysis set FASTA and decompress it
grch38_fasta=GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
wget "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/${grch38_fasta}.gz"
gunzip "${grch38_fasta}.gz"

## Build the .fai index with samtools faidx
samtools faidx "${grch38_fasta}"

## Build the BWA indices for the same FASTA
bwa index "${grch38_fasta}"

## Fetch the Mills and 1000G gold-standard indels VCF to use as the known-sites
## file, followed by its tabix index (same URL with a ".tbi" suffix).
known_sites_url=https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
wget "${known_sites_url}"
wget "${known_sites_url}.tbi"


############
## Run the bam2fq tool to extract reads from the BAM file
## Adjust the --num-threads argument to reflect the number of cores on your system.
## With 8 GPUs and 64 vCPUs this should take ~45 minutes.
############
## Input BAM and output FASTQ prefix share the same sample stem.
hg002_stem=HG002.hiseqx.pcr-free.30x.dedup.grch37
time pbrun bam2fq \
  --ref hs37d5.fa \
  --in-bam "${hg002_stem}.bam" \
  --out-prefix "${hg002_stem}.bam2fq" \
  --num-threads 64


##############
## Run the fq2bam tool to align reads to GRCh38
##############
## --in-fq takes the two mate files produced by the bam2fq step: the _1 file
##   and its _2 counterpart (the mate-2 name is assumed to mirror the _1 name;
##   check the bam2fq output directory if it differs).
## --ref is the GRCh38 FASTA that was downloaded, faidx-indexed and
##   BWA-indexed in the reference preparation steps above.
## --knownSites is the Mills/1000G indel VCF downloaded above.
time pbrun fq2bam \
  --in-fq HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq_1.fastq.gz HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq_2.fastq.gz \
  --ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
  --knownSites Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
  --out-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
  --out-recal-file HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.BQSR-REPORT.txt

	#############
	## Download the 30X hg19-aligned bam from Google's public sequencing of HG002
	## and the respective BAI file.
	#############

	## The BAI index lives next to the BAM under the same URL plus a ".bai" suffix,
	## so the URL is spelled out once and reused for both downloads.
	hg002_bam_url=https://storage.googleapis.com/brain-genomics-public/research/sequencing/grch37/bam/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.dedup.grch37.bam
	wget "${hg002_bam_url}"
	wget "${hg002_bam_url}.bai"


	#############
	## Prepare the references so we can realign reads
	#############

	## Download the original hg19 / hs37d5 reference
	## and create an FAI index
	hg19_fasta=hs37d5.fa
	wget "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/${hg19_fasta}.gz"
	## Decompress in place (leaves hs37d5.fa), then write the .fai index beside it.
	gunzip "${hg19_fasta}.gz"
	samtools faidx "${hg19_fasta}"

	## Fetch the GRCh38 no-alt analysis set FASTA and decompress it
	grch38_fasta=GCA_000001405.15_GRCh38_no_alt_analysis_set.fna
	wget "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/${grch38_fasta}.gz"
	gunzip "${grch38_fasta}.gz"

	## Build the .fai index with samtools faidx
	samtools faidx "${grch38_fasta}"

	## Build the BWA indices for the same FASTA
	bwa index "${grch38_fasta}"

	## Fetch the Mills and 1000G gold-standard indels VCF to use as the known-sites
	## file, followed by its tabix index (same URL with a ".tbi" suffix).
	known_sites_url=https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
	wget "${known_sites_url}"
	wget "${known_sites_url}.tbi"


	############
	## Run the bam2fq tool to extract reads from the BAM file
	## Adjust the --num-threads argument to reflect the number of cores on your system.
	## With 8 GPUs and 64 vCPUs this should take ~45 minutes.
	############
	## Input BAM and output FASTQ prefix share the same sample stem.
	hg002_stem=HG002.hiseqx.pcr-free.30x.dedup.grch37
	time pbrun bam2fq \
	  --ref hs37d5.fa \
	  --in-bam "${hg002_stem}.bam" \
	  --out-prefix "${hg002_stem}.bam2fq" \
	  --num-threads 64


	##############
	## Run the fq2bam tool to align reads to GRCh38
	##############
	## --in-fq takes the two mate files produced by the bam2fq step: the _1 file
	##   and its _2 counterpart (the mate-2 name is assumed to mirror the _1 name;
	##   check the bam2fq output directory if it differs).
	## --ref is the GRCh38 FASTA that was downloaded, faidx-indexed and
	##   BWA-indexed in the reference preparation steps above.
	## --knownSites is the Mills/1000G indel VCF downloaded above.
	time pbrun fq2bam \
	  --in-fq HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq_1.fastq.gz HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq_2.fastq.gz \
	  --ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
	  --knownSites Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
	  --out-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
	  --out-recal-file HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.BQSR-REPORT.txt