Skip to content

Instantly share code, notes, and snippets.

@odinokov
Last active April 21, 2023 08:36
Show Gist options
  • Save odinokov/f2c17f89b668f4c3b21df61a611ad415 to your computer and use it in GitHub Desktop.
Save odinokov/f2c17f89b668f4c3b21df61a611ad415 to your computer and use it in GitHub Desktop.
Get GC% vs coverage per bin
#!/bin/bash
# This bash script performs statistical analysis on a specified BAM file to investigate GC bias.
# The analysis is done at a bin size of 100 base pairs.
# The script removes blacklisted regions and regions with N from the reference genome (hg38),
# then generates genomic bins limited to autosomes. For each bin, the GC content is computed.
# The input BAM file is downsampled, and the coverage of each bin is then calculated.
# The mean and median coverage for each bin with known GC content is reported.
set -euo pipefail
# Define software dependencies
declare -ra deps=("samtools" "bedtools" "datamash" "pv" "crabz")
# Check that required software dependencies are installed
for dep in "${deps[@]}"
do
command -v "$dep" >/dev/null 2>&1 || { echo >&2 "Error: $dep is required but not installed. Aborting."; exit 1; }
done
download_file() {
# Extract the filename from the URL
FILENAME=$(basename "$URL")
# Check if OUT is empty
if [ ! -z "$OUT" ]; then
FILENAME=${OUT%/}/${FILENAME}
fi
# Check if the file already exists and has been downloaded completely
if [ -e "$FILENAME" ] && [ ! -e "$FILENAME.tmp" ]; then
echo "File ${FILENAME} already exists."
return 0
fi
# Check if the partially downloaded file exists
if [ -e "$FILENAME.tmp" ]; then
# Continue downloading from where we left off
wget --continue --show-progress -qO "$FILENAME.tmp" "$URL"
else
# Download the file from scratch
wget --show-progress -qO "$FILENAME.tmp" "$URL"
fi
# Check if the download was successful
if [ $? -eq 0 ]; then
# Rename the tmp file to the original filename
mv "$FILENAME.tmp" "$FILENAME"
return 0
else
echo "Download failed."
exit 1
fi
}
get_reference() {
echo "Downloading blacklisted regions..."
mkdir -p "$OUT"
URL=https://raw.githubusercontent.com/Boyle-Lab/Blacklist/master/lists/hg38-blacklist.v2.bed.gz
download_file
BL=$(basename "$URL")
echo "Downloading reference genome..."
URL=http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz
download_file
REF=$(basename "$URL" .gz)
if [ ! -e ${OUT}/${REF} ]; then
echo "Decompressing..."
crabz -d -p4 ${OUT}/${REF}.gz > ${OUT}/${REF}
echo "Indexing..."
samtools faidx ${OUT}/${REF}
fi
}
get_binned_GC_bed() {
echo "Preparing bins and computing GC%..."
WINDOW=100
FILESIZE=$(du -sb "${OUT}/${REF}" | awk '{printf "%.0f", $1}')
bedtools makewindows -w ${WINDOW} -s ${WINDOW} -g ${OUT}/${REF}.fai \
| pv -s ${FILESIZE} \
| grep -w '^#\|chr[1-9]\|chr[1-2][0-9]' \
| bedtools intersect -abam - -b ${OUT}/${BL} -v \
| bedtools nuc -fi ${OUT}/${REF} -pattern "N" -C -bed - \
| tail -n +2 \
| awk 'BEGIN {OFS="\t"}; {if ($13==0 && $5>0) print $1,$2,$3,".",".",$5}' \
> ${OUT}/${BINNED}
BINS=$(wc -l<${OUT}/${BINNED})
LC_ALL=en_US.UTF-8 printf "Total number of bins: %'d\n" ${BINS}
}
get_downsample() {
echo "Downsampling BAM file..."
FILESIZE=$(du -sb ${BAM} | awk '{printf "%.0f", $1*0.05}')
samtools view -@4 -s 0.05 -b -h -q 5 -f 3 -F 3852 -G 48 --incl-flags 48 ${BAM} chr{1..22} \
| pv -s ${FILESIZE} > ${BAM}.tmp
echo "Indexing..."
FILESIZE=$(du -sb ${BAM}.tmp | cut -f1)
echo ${FILESIZE}
cat ${BAM}.tmp | pv -s ${FILESIZE} | samtools index -@4 - -o ${BAM}.tmp.bai
}
get_coverage() {
echo "Reporting coverage over regions..."
samtools bedcov ${OUT}/${BINNED} ${BAM}.tmp \
| awk 'BEGIN {OFS="\t"}; {if ($7>0) print $6,$7}' \
| crabz -l 9 -p 4 > ${BAM}.tmp.bedcov.gz
}
get_GC_bias() {
echo "Calculating GC mean and median..."
crabz -d -p 4 ${BAM}.tmp.bedcov.gz \
| datamash --header-out -s -g1 mean 2 median 2 \
| crabz -l 9 -p 4 > ${BAM}.GCbias.txt.gz
}
cleanup() {
rm -f ${BAM}.tmp.bedcov.gz ${BAM}.tmp.bai ${BAM}.tmp
}
main() {
if [ -z ${1:-} ] || [ ! -f $1 ]; then
echo "File does not exist or the parameter is empty."
exit 1
fi
BAM=$1
OUT=./hg38
BINNED=hg38.binned_GC.bed
if [ ! -e ${OUT}/${BINNED} ]; then
echo "Generating genomic bins with GC content..."
get_reference
get_binned_GC_bed
else
echo "File ${BINNED} exists."
fi
get_downsample
get_coverage
get_GC_bias
cleanup
}
# Run main function
trap cleanup EXIT
main "$@"
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment