Created
November 11, 2023 17:39
-
-
Save JamesKane/974c137a877e591c82929891392c7e53 to your computer and use it in GitHub Desktop.
A script template for macOS for aligning short-reads with minimap2 from FASTQ files using varying references.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Usage: mm2_align.sh [b37|b38|chm13] [illumina|cg] | |
# | |
# A simple script to align short-read WGS FASTQ files to a CRAM for a target build reference | |
# on macOS with Homebrew installed. There are four major assumptions: | |
# 1) The script is run from a work path where the directory name is the Sample ID | |
# 2) The system Library contains a Genomics folder, which it can read/write a reference file | |
# 3) The system has adequate memory to allocate 1GB of ram per CPU core | |
# 4) The source FASTQ files are named [SAMPLE]/[SAMPLE]_[1|2].fastq.gz | |
# Exit when any command fails | |
set -e | |
# Ensure macOS is setup with the tools needed via Homebrew (https://brew.sh) | |
if ! [ -x "$(command -v brew)" ]; then | |
echo "Please install Homebrew (https://brew.sh)" | |
exit 1 | |
fi | |
if ! [ -x "$(command -v minimap2)" ]; then | |
echo "Installing minimap2" | |
brew install minimap2 | |
fi | |
if ! [ -x "$(command -v samtools)" ]; then | |
echo "Installing samtools" | |
brew install samtools | |
fi | |
if ! [ -x "$(command -v wget)" ]; then | |
echo "Installing wget" | |
brew install wget | |
fi | |
MAX_CORES=$(sysctl -n hw.ncpu) | |
echo 'Max Cores: '${MAX_CORES} | |
# Estimate the number of cores to use for sorting based on available RAM to control swap | |
MAX_RAM=$(sysctl -n hw.memsize) | |
SORT_CORES=$(((($MAX_RAM / 1024 / 1024 / 1024) - $MAX_CORES) / 4 - 1)) | |
echo 'Sort Cores: '$SORT_CORES | |
if [[ $1 == "chm13" ]]; then | |
reference=/Library/Genomics/Reference/chm13v2.0/chm13v2.0.fa.gz | |
source_ref=https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz | |
elif [[ $1 == "b38" ]]; then | |
reference=/Library/Genomics/Reference/b38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | |
source_ref=ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | |
elif [[ $1 == "b37" ]]; then | |
reference=/Library/Genomics/Reference/b37/human_g1k_v37.fasta.gz | |
source_ref=ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/human_g1k_v37.fasta.gz | |
else | |
echo "Unknown build or missing build: "$1 | |
exit 1 | |
fi | |
echo 'Reference: '$reference | |
if ! [ -f $reference ]; then | |
echo 'Fetching missing reference from server: '$1 | |
wget -O $reference $source_ref | |
fi | |
if ! [[ -n $2 ]]; then | |
echo "Must supply the sequencing platform: illumina | cg (CGI/MGI sequencer)" | |
exit 1 | |
fi | |
sample=${PWD##*/} | |
target=${sample}.mm2.$1.cram | |
# TODO: Add a better way to define the platform via looking at the reads in the fastq | |
READ_GROUP='@RG\tID:'${sample}'\tSM:'${sample}'\tLB:lib1\tPL:'${2}'\tPU:unit1' | |
minimap2 -ax sr -t $MAX_CORES -R $READ_GROUP ${reference} ${sample}_?.fastq.gz \ | |
| samtools fixmate -u -m - - \ | |
| samtools sort -@ $SORT_CORES -m4g -T /tmp - \ | |
| samtools markdup -@ $MAX_CORES --reference ${reference} - ${target} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment