Skip to content

Instantly share code, notes, and snippets.

@odinokov
Last active April 4, 2023 01:25
Show Gist options
  • Save odinokov/bb2f917dc11f77acff72b583a9239584 to your computer and use it in GitHub Desktop.
Save odinokov/bb2f917dc11f77acff72b583a9239584 to your computer and use it in GitHub Desktop.
Downsample and computeGCBias
#!/bin/bash
# This snippet downsamples BAM, removes blacklisted regions, and calculates GC bias for autosomes.
# mamba install -y -c bioconda -c conda-forge samtools bedtools deeptools pv
# Resulting columns (from https://www.biostars.org/p/447062/):
# N_gc: The number of reads with a given GC content
# F_gc: The number of reads spanning regions with a given GC content
# R_gc: The scaled ratio between the above values
# The number of rows should be equal to 1 plus the estimated median fragment length.
set -euo pipefail
# Define software dependencies
declare -ra deps=("samtools" "bedtools" "computeGCBias" "pv")
# Check that required software dependencies are installed
for dep in "${deps[@]}"
do
command -v "$dep" >/dev/null 2>&1 || { echo >&2 "Error: $dep is required but not installed. Aborting."; exit 1; }
done
# Define input and output file paths
input_bam="$1"
output_prefix="$(basename "$input_bam" .bam | sed -E 's@(rmduplic-|merged-|-sorted|_P|_merge|-ready)@@g')"
# https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.2bit
# https://raw.githubusercontent.com/Boyle-Lab/Blacklist/master/lists/hg38-blacklist.v2.bed.gz
genome="/mnt/sdd/data/db/hg38/hg38.2bit"
blacklist="/mnt/sdd/data/db/hg38/hg38-blacklist.v2.bed.gz"
# Define the number of CPUs to use
num_cpus=4
# Define BAM fraction
frac=0.05
# Define functions
# Check that the input BAM file exists
check_input_bam() {
if [ ! -f "${input_bam}" ]; then
echo "Error: file ${input_bam} not found"
exit 1
fi
}
# Filter and downsample the input BAM file
filter_downsample() {
filesize=$(du -sb "${input_bam}" | awk '{printf "%.0f", $1 * "${frac}"}')
echo "Downsampling..."
samtools view -@ "${num_cpus}" -s "${frac}" -b -h -q 5 -f 3 -F 3852 -G 48 --incl-flags 48 \
"${input_bam}" chr{1..22} \
| pv -s "${filesize}" \
| bedtools intersect -abam - -b "${blacklist}" -v \
> "${output_prefix}.tmp.bam"
}
# Compute GC bias
compute_gc_bias() {
echo "Computing GC bias..."
computeGCBias \
--numberOfProcessors "${num_cpus}" \
--bamfile "${output_prefix}.tmp.bam" \
--effectiveGenomeSize 2688205822 \
--genome "${genome}" \
--GCbiasFrequenciesFile "${output_prefix}.gc_bias.txt" \
--biasPlot "${output_prefix}.gc_bias.png" \
--plotFileFormat png
}
# Index the output BAM file
index_bam() {
echo "Indexing..."
filesize=$(du -sb "${output_prefix}.tmp.bam" | cut -f1)
cat "${output_prefix}.tmp.bam" | pv -s "${filesize}" | samtools index -@ "${num_cpus}" - -o "${output_prefix}.tmp.bam".bai
}
# Remove temporary files
cleanup() {
rm -f "${output_prefix}.tmp.bam" "${output_prefix}.tmp.bam.bai"
}
# Main function
main() {
check_input_bam
filter_downsample
index_bam
compute_gc_bias
cleanup
echo "Done!"
}
# Run main function
trap cleanup EXIT
main "$@"
exit 0
# how to calculate effectiveGenomeSize
# index a genome
# samtools faidx /mnt/sdd/data/db/hg38/hg38.fa
# select chromosomes or regions
# samtools faidx /mnt/sdd/data/db/hg38/hg38.fa chr{1..22} | faCount /dev/stdin
# total 2875001522
# calculate the size of blacklisted regions
# zcat /mnt/sdd/data/db/hg38/hg38-blacklist.v2.bed.gz \
# | grep -w '^#\|chr[1-9]\|chr[1-2][0-9]' \
# | awk '{ sum += ($3-$2); n++ } END { if (n > 0) print sum, n, NR;}'
# 186795700 486 486
# effectiveGenomeSize:
# 2875001522 - 186795700 = 2688205822
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment