Instantly share code, notes, and snippets.

View build_cohort.rb
# Very basic Ruby script that collects all the gVCFs in a directory, and puts the results
# into a GenomicsDB for later genotyping. The batch size is limited to 200 files at a time
# since memory usage is quite demanding. This currently consumes 18GB of RAM on a Fedora 28
# workstation. The number of reader threads does not appear to have a significant impact.
# TODO: Parameterize the contig, since GenomicsDBImport doesn't support multiple
# chromosomes at present.
# Assemble the opening of the GenomicsDBImport invocation. Every fragment is
# terminated with " \<newline>" so the assembled text reads as one
# backslash-continued shell command.
command = [
  'gatk --java-options "-Xmx32g -Xms32g" GenomicsDBImport',
  '-R /mnt/genomics/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa'
].map { |fragment| "#{fragment} \\\n" }.join
View fastq_to_sam.sh
# USAGE: sh fastq_to_sam.sh <fastq1> <fastq2> <sample_name> <read_group> <platform_unit>
# Runs GATK FastqToSam on a pair of FASTQ files and writes <sample_name>.unmapped.bam.
# Positional arguments map onto the options below as follows:
#   $1, $2 -> FASTQ / FASTQ2 (the two input FASTQ files)
#   $3     -> SAMPLE_NAME, LIBRARY_NAME and the prefix of the OUTPUT file
#   $4     -> READ_GROUP_NAME
#   $5     -> PLATFORM_UNIT
# Path to the GATK launcher; update to match the local install.
gatk=~/Genomics/gatk-4.0.4.0/gatk
$gatk --java-options "-Xmx8G" FastqToSam \
-FASTQ=$1 \
-FASTQ2=$2 \
-OUTPUT=$3.unmapped.bam \
-READ_GROUP_NAME=$4 \
-SAMPLE_NAME=$3 \
-LIBRARY_NAME=$3 \
-PLATFORM_UNIT=$5 \
View prepare_gvcf.sh
# USAGE: sh prepare_gvcf.sh <sample name>
# CONFIG VARIABLES: Update to match environment
# GATK launcher, reference FASTA, known-indels VCF and dbSNP SNP VCF.
gatk=~/Genomics/gatk-4.0.4.0/gatk
reference=~/Genomics/Reference/GRCh38/GRCh38_full_analysis_set_plus_decoy_hla.fa
known=~/Genomics/Reference/GRCh38/Mills_and_1000G_gold_standard.indels.b38.primary_assembly.vcf.gz
snpdb=~/Genomics/Reference/GRCh38/ALL_20141222.dbSNP142_human_GRCh38.snps.vcf
# Run MarkDuplicates on <sample name>.bwa.clean.bam, producing
# <sample name>.dedup.bam. The metrics go to the fixed file name metrics.txt
# in the current working directory.
$gatk --java-options "-Xmx4G" MarkDuplicates \
-I=$1.bwa.clean.bam \
-O=$1.dedup.bam \
-METRICS_FILE=metrics.txt
View create_clean_bam.sh
# USAGE: sh create_clean_bam.sh <sample name>
# Based on https://software.broadinstitute.org/gatk/documentation/article.php?id=6483
# CONFIG VARIABLES: Update to match environment
# Path to the GATK launcher.
gatk=~/Genomics/gatk-4.0.4.0/gatk
# Reference genome FASTA.
# NOTE(review): prepare_gvcf.sh points at Reference/GRCh38/ for the same file
# name - confirm which location is correct for this environment.
reference=~/Genomics/Reference/GRCh38_full_analysis_set_plus_decoy_hla.fa
# Scratch directory; presumably handed to the GATK tools as their temp dir - TODO confirm.
tmp_dir=/Volumes/External/tmp
# Mark the Illumina adapters, if present. (The sequencing lab should have removed
# them prior to delivering the results.)
$gatk --java-options "-Xmx8G" MarkIlluminaAdapters \
View revert_bam.sh
# USAGE: sh revert_bam.sh <sample name>
# Assumes GATK is on the path. Based on https://gatkforums.broadinstitute.org/gatk/discussion/6484#latest%23top
# Runs RevertSam on <sample name>.bam and writes <sample name>.unmapped.bam.
# The sample name is quoted so names containing spaces or glob characters
# still reach GATK as a single argument.
# Every continued line must have whitespace before its trailing backslash: the
# shell deletes the backslash-newline pair, so without the space two options
# fuse into one word (e.g. "-O=x.unmapped.bam-SANITIZE=true").
gatk RevertSam \
-I="$1.bam" \
-O="$1.unmapped.bam" \
-SANITIZE=true \
-MAX_DISCARD_FRACTION=0.005 \
-ATTRIBUTE_TO_CLEAR=XT \
-ATTRIBUTE_TO_CLEAR=XN \
-ATTRIBUTE_TO_CLEAR=AS \