ckandoth/prep_grch38_ref.txt

## prep_grch38_ref.txt
# Create a dedicated conda environment called "ref" and switch into it:
mamba create -n ref
conda activate ref

# Install the pinned versions of every tool used below from the bioconda channel:
mamba install -y -c bioconda \
  htslib==1.13 \
  bcftools==1.13 \
  samtools==1.13 \
  picard-slim==2.26.2 \
  bwa-mem2==2.2.1 \
  bwa==0.7.17 \
  gsutil==4.68

# Download the alignment-ready GRCh38 FASTA and its .fai index into the current directory:
ref_fa=GRCh38_Verily_v1.genome.fa
ref_bucket=gs://genomics-public-data/references/GRCh38_Verily
gsutil -m cp \
  "${ref_bucket}/${ref_fa}" \
  "${ref_bucket}/${ref_fa}.fai" \
  .

# Build the remaining indexes needed downstream:
# a Picard sequence dictionary, a bwa-mem2 index, and a classic bwa index.
picard CreateSequenceDictionary -R "${ref_fa}"
bwa-mem2 index "${ref_fa}"
bwa index "${ref_fa}"

# Download the dbSNP build 155 VCF (contigs are named by RefSeq accession) and index it:
dbsnp_raw=GCF_000001405.39.gz
curl -LO "https://ftp.ncbi.nih.gov/snp/archive/b155/VCF/${dbsnp_raw}"
tabix -p vcf "${dbsnp_raw}"

# Build the accession -> chromosome-name lookup from columns 7 and 10 of the
# assembly report, dropping "#" header lines and any row containing the word "na":
acc_map=GCF_000001405.39.acc_ids.txt
report_url=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_report.txt
curl -sL "${report_url}" \
  | grep -v '^#' \
  | cut -f7,10 \
  | grep -wv na \
  > "${acc_map}"

# Rename the contigs with that lookup, keep only the CLNORIGIN INFO field, and index the result:
dbsnp_all=dbsnp_b155_grch38_all.vcf.gz
bcftools annotate \
  --threads 8 \
  --remove ^INF/CLNORIGIN \
  --rename-chrs "${acc_map}" \
  --output-type z \
  --output "${dbsnp_all}" \
  "${dbsnp_raw}"
tabix -p vcf "${dbsnp_all}"

# Write a smaller VCF holding only SNPs on chr1-chr22, chrX, chrY and chrM, then index it.
# The comma-separated region list is generated and its trailing comma trimmed.
snp_regions=$(printf 'chr%s,' {1..22} X Y M)
snp_regions=${snp_regions%,}
dbsnp_snps=dbsnp_b155_grch38_snps.vcf.gz
bcftools view \
  --threads 8 \
  --types snps \
  --regions "${snp_regions}" \
  --output-type z \
  --output-file "${dbsnp_snps}" \
  "${dbsnp_all}"
tabix -p vcf "${dbsnp_snps}"

# Download two WES/WGS resources, each with its .tbi index, from the GATK best-practices bucket:
#   1000g_pon.hg38.vcf.gz       - panel-of-normals generated using GATK on 1000genomes data
#   af-only-gnomad.hg38.vcf.gz  - gnomAD 2 VCF for use as a germline resource with MuTect2
gatk_bucket=gs://gatk-best-practices/somatic-hg38
for resource_vcf in 1000g_pon.hg38.vcf.gz af-only-gnomad.hg38.vcf.gz; do
  gsutil -m cp "${gatk_bucket}/${resource_vcf}" "${gatk_bucket}/${resource_vcf}.tbi" .
done
	# Prepare a conda environment named "ref" with the pinned tools we will need:
	mamba create -n ref; conda activate ref
	mamba install -y -c bioconda htslib==1.13 bcftools==1.13 samtools==1.13 picard-slim==2.26.2 bwa-mem2==2.2.1 bwa==0.7.17 gsutil==4.68

	# Fetch the alignment-ready human reference FASTA and its .fai index:
	gsutil -m cp gs://genomics-public-data/references/GRCh38_Verily/GRCh38_Verily_v1.genome.fa{,.fai} .

	# Index the reference FASTA for use with various tools (Picard dictionary, bwa-mem2, bwa):
	picard CreateSequenceDictionary -R GRCh38_Verily_v1.genome.fa
	bwa-mem2 index GRCh38_Verily_v1.genome.fa
	bwa index GRCh38_Verily_v1.genome.fa

	# Fetch the dbSNP VCF and index it:
	curl -LO https://ftp.ncbi.nih.gov/snp/archive/b155/VCF/GCF_000001405.39.gz
	tabix -p vcf GCF_000001405.39.gz
	# Build the RefSeq-accession -> chromosome-name map from columns 7 and 10 of the
	# assembly report, skipping "#" header lines and rows containing the word "na".
	# Each stage is joined by a real shell pipe so the report is filtered before it is saved.
	curl -sL https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_report.txt \
		| grep -v '^#' \
		| cut -f7,10 \
		| grep -wv na \
		> GCF_000001405.39.acc_ids.txt
	# Convert RefSeq Accession IDs to chromosome names and keep only the CLNORIGIN info:
	bcftools annotate --threads 8 --remove ^INF/CLNORIGIN --rename-chrs GCF_000001405.39.acc_ids.txt --output-type z --output dbsnp_b155_grch38_all.vcf.gz GCF_000001405.39.gz
	tabix -p vcf dbsnp_b155_grch38_all.vcf.gz

	# Generate a smaller dbSNP VCF listing only SNPs in autosomes, X, Y, and MT:
	bcftools view --threads 8 --types snps --regions chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chrX,chrY,chrM --output-type z --output-file dbsnp_b155_grch38_snps.vcf.gz dbsnp_b155_grch38_all.vcf.gz
	tabix -p vcf dbsnp_b155_grch38_snps.vcf.gz

	# Fetch the WES/WGS panel-of-normals generated using GATK on 1000genomes data:
	gsutil -m cp gs://gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz{,.tbi} .

	# Fetch the WES/WGS gnomAD 2 VCF for use as a germline resource with MuTect2:
	gsutil -m cp gs://gatk-best-practices/somatic-hg38/af-only-gnomad.hg38.vcf.gz{,.tbi} .