James Kane JamesKane

## mm2_align.sh
#!/bin/bash

# Usage: mm2_align.sh [b37|b38|chm13] [illumina|cg]
#
# A simple script to align short-read WGS FASTQ files to a CRAM for a target build reference
# on macOS with Homebrew installed.  There are four major assumptions:
# 1) The script is run from a work path where the directory name is the Sample ID
# 2) The system Library contains a Genomics folder, in which it can read/write a reference file
# 3) The system has adequate memory to allocate 1 GB of RAM per CPU core
# 4) The source FASTQ files are named [SAMPLE]/[SAMPLE]_[1|2].fastq.gz

## realign.sh
#!/bin/bash

# USAGE: realign.sh [Source BAM/CRAM]
# ./realign.sh source.GRCh38.bam
#
# The script produces a new BAM aligned on the reference specified in the variable.  Once complete it will apply CallableLoci
# for some quick QC.  The script assumes that the working directory name matches the Sample e.g.
# /mnt/md0/B6564/source.GRCh38.bam
#
# There's a generation of Big Y 500 which does not have the pairs marked correctly.  This results in treating the reads as single-end (SE).

## ena_align.sh
#!/bin/bash

# USAGE: sh ena_align.sh
#
# This simple script was originally developed to automate aligning and filtering samples from ENA for ydna-warehouse.org.  It has
# become the default workflow for all NGS data that needs to be standardized, keeping samples from the menagerie of D2C vendors
# as consistent as possible.
#
# The script is built on the assumption your FASTQ read data is pre-trimmed and organized with this structure.
# SAMPLE/SAMPLE_[1|2].fastq.gz

## build_cohort.rb
# Very basic Ruby script that collects all the gVCFs in a directory, and puts the results
# into a GenomicsDB for later genotyping.  The batch size is limited to 200 files at a time
# since memory usage is quite demanding.  This currently consumes 18GB of RAM on a Fedora 28
# workstation.  Reader threads do not appear to have a significant impact.

# TODO: Parameterize the contig, since GenomicsDBImport doesn't support multiple
# chromosomes at present.

command = "gatk --java-options \"-Xmx32g -Xms32g\" GenomicsDBImport \\\n"
command += "-R /mnt/genomics/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa \\\n"

## fastq_to_sam.sh
# USAGE: sh fastq_to_sam.sh <fastq1> <fastq2> <sample_name> <read_group> <platform_unit>
#
# Runs GATK FastqToSam on a pair of FASTQ files, writing <sample_name>.unmapped.bam.
#   $1  first FASTQ file
#   $2  second FASTQ file
#   $3  sample name; also used as the library name and the output file prefix
#   $4  read group name
#   $5  platform unit
#
# Path to the GATK launcher; update to match environment.
gatk=~/Genomics/gatk-4.0.4.0/gatk
# Every expansion is quoted so that paths or names containing spaces or glob
# characters reach GATK as a single argument.
"$gatk" --java-options "-Xmx8G" FastqToSam \
    -FASTQ="$1" \
    -FASTQ2="$2" \
    -OUTPUT="$3.unmapped.bam" \
    -READ_GROUP_NAME="$4" \
    -SAMPLE_NAME="$3" \
    -LIBRARY_NAME="$3" \
    -PLATFORM_UNIT="$5"

## prepare_gvcf.sh
# USAGE:  sh prepare_gvcf.sh <sample name>
#
# Runs GATK MarkDuplicates on <sample name>.bwa.clean.bam, writing
# <sample name>.dedup.bam and the metrics file metrics.txt.
#   $1  sample name, used as the prefix of the input and output BAM files
#
# CONFIG VARIABLES:  Update to match environment
gatk=~/Genomics/gatk-4.0.4.0/gatk
# NOTE(review): reference, known and snpdb are not used by the command below;
# presumably later steps of the script consume them -- confirm.
reference=~/Genomics/Reference/GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa
known=~/Genomics/Reference/GRCh38/Mills_and_1000G_gold_standard.indels.b38.primary_assembly.vcf.gz
snpdb=~/Genomics/Reference/GRCh38/ALL_20141222.dbSNP142_human_GRCh38.snps.vcf

# Quoted so a sample name containing spaces or glob characters stays one argument.
"$gatk" --java-options "-Xmx4G" \
	MarkDuplicates -I="$1.bwa.clean.bam" -O="$1.dedup.bam" -METRICS_FILE=metrics.txt


## create_clean_bam.sh
# USAGE:  sh create_clean_bam.sh <sample name>
# Based on https://software.broadinstitute.org/gatk/documentation/article.php?id=6483
# CONFIG VARIABLES:  Update to match environment
gatk=~/Genomics/gatk-4.0.4.0/gatk
reference=~/Genomics/Reference/GRCh38_full_analysis_set_plus_decoy_hla.fa
tmp_dir=/Volumes/External/tmp

# Mark the Illumina adapters, if present.  (The sequencing lab should have removed them
# prior to delivering the results.)
$gatk --java-options "-Xmx8G" MarkIlluminaAdapters \

## revert_bam.sh
# USAGE:  sh revert_bam.sh <sample name>
# Assumes GATK is on the path.  Based on https://gatkforums.broadinstitute.org/gatk/discussion/6484#latest%23top
#
# Runs GATK RevertSam on <sample name>.bam, writing <sample name>.unmapped.bam.
#   $1  sample name, used as the prefix of the input and output BAM files
#
# The file names are quoted so a sample name containing spaces or glob
# characters stays a single argument, and the last option carries no line
# continuation so the command cannot absorb whatever follows it.
gatk RevertSam \
    -I="$1.bam" \
    -O="$1.unmapped.bam" \
    -SANITIZE=true \
    -MAX_DISCARD_FRACTION=0.005 \
    -ATTRIBUTE_TO_CLEAR=XT \
    -ATTRIBUTE_TO_CLEAR=XN \
    -ATTRIBUTE_TO_CLEAR=AS
	#!/bin/bash

	# Usage: mm2_align.sh [b37|b38|chm13] [illumina|cg]
	#
	# A simple script to align short-read WGS FASTQ files to a CRAM for a target build reference
	# on macOS with Homebrew installed. There are four major assumptions:
	# 1) The script is run from a work path where the directory name is the Sample ID
	# 2) The system Library contains a Genomics folder, in which it can read/write a reference file
	# 3) The system has adequate memory to allocate 1 GB of RAM per CPU core
	# 4) The source FASTQ files are named [SAMPLE]/[SAMPLE]_[1|2].fastq.gz
	#!/bin/bash

	# USAGE: realign.sh [Source BAM/CRAM]
	# ./realign.sh source.GRCh38.bam
	#
	# The script produces a new BAM aligned on the reference specified in the variable. Once complete it will apply CallableLoci
	# for some quick QC. The script assumes that the working directory name matches the Sample e.g.
	# /mnt/md0/B6564/source.GRCh38.bam
	#
	# There's a generation of Big Y 500 which does not have the pairs marked correctly. This results in treating the reads as single-end (SE).
	#!/bin/bash

	# USAGE: sh ena_align.sh
	#
	# This simple script was originally developed to automate aligning and filtering samples from ENA for ydna-warehouse.org. It has
	# become the default workflow for all NGS data that needs to be standardized, keeping samples from the menagerie of D2C vendors
	# as consistent as possible.
	#
	# The script is built on the assumption your FASTQ read data is pre-trimmed and organized with this structure.
	# SAMPLE/SAMPLE_[1|2].fastq.gz
	# Very basic Ruby script that collects all the gVCFs in a directory, and puts the results
	# into a GenomicsDB for later genotyping. The batch size is limited to 200 files at a time
	# since memory usage is quite demanding. This currently consumes 18GB of RAM on a Fedora 28
	# workstation. Reader threads do not appear to have a significant impact.

	# TODO: Parameterize the contig, since GenomicsDBImport doesn't support multiple
	# chromosomes at present.

	command = "gatk --java-options \"-Xmx32g -Xms32g\" GenomicsDBImport \\\n"
	command += "-R /mnt/genomics/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa \\\n"
	# USAGE: sh fastq_to_sam.sh <fastq1> <fastq2> <sample_name> <read_group> <platform_unit>
	#
	# Runs GATK FastqToSam on a pair of FASTQ files, writing <sample_name>.unmapped.bam.
	#   $1  first FASTQ file
	#   $2  second FASTQ file
	#   $3  sample name; also used as the library name and the output file prefix
	#   $4  read group name
	#   $5  platform unit
	#
	# Path to the GATK launcher; update to match environment.
	gatk=~/Genomics/gatk-4.0.4.0/gatk
	# Every expansion is quoted so that paths or names containing spaces or glob
	# characters reach GATK as a single argument.
	"$gatk" --java-options "-Xmx8G" FastqToSam \
	-FASTQ="$1" \
	-FASTQ2="$2" \
	-OUTPUT="$3.unmapped.bam" \
	-READ_GROUP_NAME="$4" \
	-SAMPLE_NAME="$3" \
	-LIBRARY_NAME="$3" \
	-PLATFORM_UNIT="$5"
	# USAGE: sh prepare_gvcf.sh <sample name>
	#
	# Runs GATK MarkDuplicates on <sample name>.bwa.clean.bam, writing
	# <sample name>.dedup.bam and the metrics file metrics.txt.
	#   $1  sample name, used as the prefix of the input and output BAM files
	#
	# CONFIG VARIABLES: Update to match environment
	gatk=~/Genomics/gatk-4.0.4.0/gatk
	# NOTE(review): reference, known and snpdb are not used by the command below;
	# presumably later steps of the script consume them -- confirm.
	reference=~/Genomics/Reference/GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa
	known=~/Genomics/Reference/GRCh38/Mills_and_1000G_gold_standard.indels.b38.primary_assembly.vcf.gz
	snpdb=~/Genomics/Reference/GRCh38/ALL_20141222.dbSNP142_human_GRCh38.snps.vcf

	# Quoted so a sample name containing spaces or glob characters stays one argument.
	"$gatk" --java-options "-Xmx4G" \
	MarkDuplicates -I="$1.bwa.clean.bam" -O="$1.dedup.bam" -METRICS_FILE=metrics.txt
	# USAGE: sh create_clean_bam.sh <sample name>
	# Based on https://software.broadinstitute.org/gatk/documentation/article.php?id=6483
	# CONFIG VARIABLES: Update to match environment
	gatk=~/Genomics/gatk-4.0.4.0/gatk
	reference=~/Genomics/Reference/GRCh38_full_analysis_set_plus_decoy_hla.fa
	tmp_dir=/Volumes/External/tmp

	# Mark the Illumina adapters, if present. (The sequencing lab should have removed them
	# prior to delivering the results.)
	$gatk --java-options "-Xmx8G" MarkIlluminaAdapters \
	# USAGE: sh revert_bam.sh <sample name>
	# Assumes GATK is on the path. Based on https://gatkforums.broadinstitute.org/gatk/discussion/6484#latest%23top
	gatk RevertSam \
	-I=$1.bam \
	-O=$1.unmapped.bam\
	-SANITIZE=true \
	-MAX_DISCARD_FRACTION=0.005 \
	-ATTRIBUTE_TO_CLEAR=XT \
	-ATTRIBUTE_TO_CLEAR=XN \
	-ATTRIBUTE_TO_CLEAR=AS \