JamesKane/mm2_align.sh

## mm2_align.sh
#!/bin/bash

# Usage: mm2_align.sh [b37|b38|chm13] [illumina|cg]
#
# A simple script to align short-read WGS FASTQ files to a CRAM for a target build reference
# on macOS with Homebrew installed.  There are four major assumptions:
# 1) The script is run from a work path where the directory name is the Sample ID
# 2) The system Library contains a Genomics folder, which it can read/write a reference file
# 3) The system has adequate memory to allocate 1GB of ram per CPU core
# 4) The source FASTQ files are named [SAMPLE]/[SAMPLE]_[1|2].fastq.gz

# Exit when any command fails
set -e

# Ensure macOS is setup with the tools needed via Homebrew (https://brew.sh)
if ! [ -x "$(command -v brew)" ]; then
	echo "Please install Homebrew (https://brew.sh)"
	exit 1
fi

if ! [ -x "$(command -v minimap2)" ]; then
	echo "Installing minimap2"
	brew install minimap2
fi

if ! [ -x "$(command -v samtools)" ]; then
	echo "Installing samtools"
	brew install samtools
fi

if ! [ -x "$(command -v wget)" ]; then
	echo "Installing wget"
	brew install wget
fi

MAX_CORES=$(sysctl -n hw.ncpu)
echo 'Max Cores: '${MAX_CORES}

# Estimate the number of cores to use for sorting based on available RAM to control swap
MAX_RAM=$(sysctl -n hw.memsize)
SORT_CORES=$(((($MAX_RAM / 1024 / 1024 / 1024) - $MAX_CORES) / 4 - 1))
echo 'Sort Cores: '$SORT_CORES

if [[ $1 == "chm13" ]]; then
	reference=/Library/Genomics/Reference/chm13v2.0/chm13v2.0.fa.gz
	source_ref=https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz
elif [[ $1 == "b38" ]]; then
	reference=/Library/Genomics/Reference/b38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
	source_ref=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
elif [[ $1 == "b37" ]]; then
	reference=/Library/Genomics/Reference/b37/human_g1k_v37.fasta.gz
	source_ref=ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz
else
	echo "Unknown build or missing build: "$1
	exit 1
fi

echo 'Reference: '$reference
if ! [ -f $reference ]; then
	echo 'Fetching missing reference from server: '$1
	wget -O $reference $source_ref
fi

if ! [[ -n $2 ]]; then
	echo "Must supply the sequencing platform: illumina | cg (CGI/MGI sequencer)"
	exit 1
fi

sample=${PWD##*/}
target=${sample}.mm2.$1.cram

# TODO: Add a better way to define the platform via looking at the reads in the fastq
READ_GROUP='@RG\tID:'${sample}'\tSM:'${sample}'\tLB:lib1\tPL:'${2}'\tPU:unit1'

minimap2 -ax sr -t $MAX_CORES -R $READ_GROUP ${reference} ${sample}_?.fastq.gz \
  | samtools fixmate -u -m - - \
  | samtools sort -@ $SORT_CORES -m4g -T /tmp - \
  | samtools markdup -@ $MAX_CORES --reference ${reference} - ${target}
	#!/bin/bash

	# Usage: mm2_align.sh [b37\|b38\|chm13] [illumina\|cg]
	#
	# A simple script to align short-read WGS FASTQ files to a CRAM for a target build reference
	# on macOS with Homebrew installed. There are four major assumptions:
	# 1) The script is run from a work path where the directory name is the Sample ID
	# 2) The system Library contains a Genomics folder, which it can read/write a reference file
	# 3) The system has adequate memory to allocate 1GB of ram per CPU core
	# 4) The source FASTQ files are named [SAMPLE]/[SAMPLE]_[1\|2].fastq.gz

	# Exit when any command fails
	set -e

	# Ensure macOS is setup with the tools needed via Homebrew (https://brew.sh)
	if ! [ -x "$(command -v brew)" ]; then
	echo "Please install Homebrew (https://brew.sh)"
	exit 1
	fi

	if ! [ -x "$(command -v minimap2)" ]; then
	echo "Installing minimap2"
	brew install minimap2
	fi

	if ! [ -x "$(command -v samtools)" ]; then
	echo "Installing samtools"
	brew install samtools
	fi

	if ! [ -x "$(command -v wget)" ]; then
	echo "Installing wget"
	brew install wget
	fi

	MAX_CORES=$(sysctl -n hw.ncpu)
	echo 'Max Cores: '${MAX_CORES}

	# Estimate the number of cores to use for sorting based on available RAM to control swap
	MAX_RAM=$(sysctl -n hw.memsize)
	SORT_CORES=$(((($MAX_RAM / 1024 / 1024 / 1024) - $MAX_CORES) / 4 - 1))
	echo 'Sort Cores: '$SORT_CORES

	if [[ $1 == "chm13" ]]; then
	reference=/Library/Genomics/Reference/chm13v2.0/chm13v2.0.fa.gz
	source_ref=https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz
	elif [[ $1 == "b38" ]]; then
	reference=/Library/Genomics/Reference/b38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
	source_ref=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
	elif [[ $1 == "b37" ]]; then
	reference=/Library/Genomics/Reference/b37/human_g1k_v37.fasta.gz
	source_ref=ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz
	else
	echo "Unknown build or missing build: "$1
	exit 1
	fi

	echo 'Reference: '$reference
	if ! [ -f $reference ]; then
	echo 'Fetching missing reference from server: '$1
	wget -O $reference $source_ref
	fi

	if ! [[ -n $2 ]]; then
	echo "Must supply the sequencing platform: illumina \| cg (CGI/MGI sequencer)"
	exit 1
	fi

	sample=${PWD##*/}
	target=${sample}.mm2.$1.cram

	# TODO: Add a better way to define the platform via looking at the reads in the fastq
	READ_GROUP='@RG\tID:'${sample}'\tSM:'${sample}'\tLB:lib1\tPL:'${2}'\tPU:unit1'

	minimap2 -ax sr -t $MAX_CORES -R $READ_GROUP ${reference} ${sample}_?.fastq.gz \
	\| samtools fixmate -u -m - - \
	\| samtools sort -@ $SORT_CORES -m4g -T /tmp - \
	\| samtools markdup -@ $MAX_CORES --reference ${reference} - ${target}