Last active
April 4, 2023 01:25
-
-
Save odinokov/bb2f917dc11f77acff72b583a9239584 to your computer and use it in GitHub Desktop.
Downsample and computeGCBias
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# This snippet downsamples BAM, removes blacklisted regions, and calculates GC bias for autosomes. | |
# mamba install -y -c bioconda -c conda-forge samtools bedtools deeptools pv | |
# Resulting columns (from https://www.biostars.org/p/447062/): | |
# N_gc: The number of reads with a given GC content | |
# F_gc: The number of reads spanning regions with a given GC content | |
# R_gc: The scaled ratio between the above values | |
# The number of rows should be equal to 1 plus the estimated median fragment length. | |
set -euo pipefail | |
# Define software dependencies | |
declare -ra deps=("samtools" "bedtools" "computeGCBias" "pv") | |
# Check that required software dependencies are installed | |
for dep in "${deps[@]}" | |
do | |
command -v "$dep" >/dev/null 2>&1 || { echo >&2 "Error: $dep is required but not installed. Aborting."; exit 1; } | |
done | |
# Define input and output file paths | |
input_bam="$1" | |
output_prefix="$(basename "$input_bam" .bam | sed -E 's@(rmduplic-|merged-|-sorted|_P|_merge|-ready)@@g')" | |
# https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.2bit | |
# https://raw.githubusercontent.com/Boyle-Lab/Blacklist/master/lists/hg38-blacklist.v2.bed.gz | |
genome="/mnt/sdd/data/db/hg38/hg38.2bit" | |
blacklist="/mnt/sdd/data/db/hg38/hg38-blacklist.v2.bed.gz" | |
# Define the number of CPUs to use | |
num_cpus=4 | |
# Define BAM fraction | |
frac=0.05 | |
# Define functions | |
# Check that the input BAM file exists | |
check_input_bam() { | |
if [ ! -f "${input_bam}" ]; then | |
echo "Error: file ${input_bam} not found" | |
exit 1 | |
fi | |
} | |
# Filter and downsample the input BAM file | |
filter_downsample() { | |
filesize=$(du -sb "${input_bam}" | awk '{printf "%.0f", $1 * "${frac}"}') | |
echo "Downsampling..." | |
samtools view -@ "${num_cpus}" -s "${frac}" -b -h -q 5 -f 3 -F 3852 -G 48 --incl-flags 48 \ | |
"${input_bam}" chr{1..22} \ | |
| pv -s "${filesize}" \ | |
| bedtools intersect -abam - -b "${blacklist}" -v \ | |
> "${output_prefix}.tmp.bam" | |
} | |
# Compute GC bias | |
compute_gc_bias() { | |
echo "Computing GC bias..." | |
computeGCBias \ | |
--numberOfProcessors "${num_cpus}" \ | |
--bamfile "${output_prefix}.tmp.bam" \ | |
--effectiveGenomeSize 2688205822 \ | |
--genome "${genome}" \ | |
--GCbiasFrequenciesFile "${output_prefix}.gc_bias.txt" \ | |
--biasPlot "${output_prefix}.gc_bias.png" \ | |
--plotFileFormat png | |
} | |
# Index the output BAM file | |
index_bam() { | |
echo "Indexing..." | |
filesize=$(du -sb "${output_prefix}.tmp.bam" | cut -f1) | |
cat "${output_prefix}.tmp.bam" | pv -s "${filesize}" | samtools index -@ "${num_cpus}" - -o "${output_prefix}.tmp.bam".bai | |
} | |
# Remove temporary files | |
cleanup() { | |
rm -f "${output_prefix}.tmp.bam" "${output_prefix}.tmp.bam.bai" | |
} | |
# Main function | |
main() { | |
check_input_bam | |
filter_downsample | |
index_bam | |
compute_gc_bias | |
cleanup | |
echo "Done!" | |
} | |
# Run main function | |
trap cleanup EXIT | |
main "$@" | |
exit 0 | |
# how to calculate effectiveGenomeSize | |
# index a genome | |
# samtools faidx /mnt/sdd/data/db/hg38/hg38.fa | |
# select chromosomes or regions | |
# samtools faidx /mnt/sdd/data/db/hg38/hg38.fa chr{1..22} | faCount /dev/stdin | |
# total 2875001522 | |
# calculate the size of blacklisted regions | |
# zcat /mnt/sdd/data/db/hg38/hg38-blacklist.v2.bed.gz \ | |
# | grep -w '^#\|chr[1-9]\|chr[1-2][0-9]' \ | |
# | awk '{ sum += ($3-$2); n++ } END { if (n > 0) print sum, n, NR;}' | |
# 186795700 486 486 | |
# effectiveGenomeSize: | |
# 2875001522 - 186795700 = 2688205822 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment