Last active
April 21, 2023 08:36
-
-
Save odinokov/f2c17f89b668f4c3b21df61a611ad415 to your computer and use it in GitHub Desktop.
Get GC% vs coverage per bin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# This bash script performs statistical analysis on a specified BAM file to investigate GC bias. | |
# The analysis is done at a bin size of 100 base pairs. | |
# The script removes blacklisted regions and regions with N from the reference genome (hg38), | |
# then generates genomic bins limited to autosomes. For each bin, the GC content is computed. | |
# The input BAM file is downsampled, and the coverage of each bin is then calculated. | |
# The mean and median coverage for each bin with known GC content is reported. | |
set -euo pipefail | |
# Define software dependencies | |
declare -ra deps=("samtools" "bedtools" "datamash" "pv" "crabz") | |
# Check that required software dependencies are installed | |
for dep in "${deps[@]}" | |
do | |
command -v "$dep" >/dev/null 2>&1 || { echo >&2 "Error: $dep is required but not installed. Aborting."; exit 1; } | |
done | |
download_file() { | |
# Extract the filename from the URL | |
FILENAME=$(basename "$URL") | |
# Check if OUT is empty | |
if [ ! -z "$OUT" ]; then | |
FILENAME=${OUT%/}/${FILENAME} | |
fi | |
# Check if the file already exists and has been downloaded completely | |
if [ -e "$FILENAME" ] && [ ! -e "$FILENAME.tmp" ]; then | |
echo "File ${FILENAME} already exists." | |
return 0 | |
fi | |
# Check if the partially downloaded file exists | |
if [ -e "$FILENAME.tmp" ]; then | |
# Continue downloading from where we left off | |
wget --continue --show-progress -qO "$FILENAME.tmp" "$URL" | |
else | |
# Download the file from scratch | |
wget --show-progress -qO "$FILENAME.tmp" "$URL" | |
fi | |
# Check if the download was successful | |
if [ $? -eq 0 ]; then | |
# Rename the tmp file to the original filename | |
mv "$FILENAME.tmp" "$FILENAME" | |
return 0 | |
else | |
echo "Download failed." | |
exit 1 | |
fi | |
} | |
get_reference() { | |
echo "Downloading blacklisted regions..." | |
mkdir -p "$OUT" | |
URL=https://raw.githubusercontent.com/Boyle-Lab/Blacklist/master/lists/hg38-blacklist.v2.bed.gz | |
download_file | |
BL=$(basename "$URL") | |
echo "Downloading reference genome..." | |
URL=http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz | |
download_file | |
REF=$(basename "$URL" .gz) | |
if [ ! -e ${OUT}/${REF} ]; then | |
echo "Decompressing..." | |
crabz -d -p4 ${OUT}/${REF}.gz > ${OUT}/${REF} | |
echo "Indexing..." | |
samtools faidx ${OUT}/${REF} | |
fi | |
} | |
get_binned_GC_bed() { | |
echo "Preparing bins and computing GC%..." | |
WINDOW=100 | |
FILESIZE=$(du -sb "${OUT}/${REF}" | awk '{printf "%.0f", $1}') | |
bedtools makewindows -w ${WINDOW} -s ${WINDOW} -g ${OUT}/${REF}.fai \ | |
| pv -s ${FILESIZE} \ | |
| grep -w '^#\|chr[1-9]\|chr[1-2][0-9]' \ | |
| bedtools intersect -abam - -b ${OUT}/${BL} -v \ | |
| bedtools nuc -fi ${OUT}/${REF} -pattern "N" -C -bed - \ | |
| tail -n +2 \ | |
| awk 'BEGIN {OFS="\t"}; {if ($13==0 && $5>0) print $1,$2,$3,".",".",$5}' \ | |
> ${OUT}/${BINNED} | |
BINS=$(wc -l<${OUT}/${BINNED}) | |
LC_ALL=en_US.UTF-8 printf "Total number of bins: %'d\n" ${BINS} | |
} | |
get_downsample() { | |
echo "Downsampling BAM file..." | |
FILESIZE=$(du -sb ${BAM} | awk '{printf "%.0f", $1*0.05}') | |
samtools view -@4 -s 0.05 -b -h -q 5 -f 3 -F 3852 -G 48 --incl-flags 48 ${BAM} chr{1..22} \ | |
| pv -s ${FILESIZE} > ${BAM}.tmp | |
echo "Indexing..." | |
FILESIZE=$(du -sb ${BAM}.tmp | cut -f1) | |
echo ${FILESIZE} | |
cat ${BAM}.tmp | pv -s ${FILESIZE} | samtools index -@4 - -o ${BAM}.tmp.bai | |
} | |
get_coverage() { | |
echo "Reporting coverage over regions..." | |
samtools bedcov ${OUT}/${BINNED} ${BAM}.tmp \ | |
| awk 'BEGIN {OFS="\t"}; {if ($7>0) print $6,$7}' \ | |
| crabz -l 9 -p 4 > ${BAM}.tmp.bedcov.gz | |
} | |
get_GC_bias() { | |
echo "Calculating GC mean and median..." | |
crabz -d -p 4 ${BAM}.tmp.bedcov.gz \ | |
| datamash --header-out -s -g1 mean 2 median 2 \ | |
| crabz -l 9 -p 4 > ${BAM}.GCbias.txt.gz | |
} | |
cleanup() { | |
rm -f ${BAM}.tmp.bedcov.gz ${BAM}.tmp.bai ${BAM}.tmp | |
} | |
main() { | |
if [ -z ${1:-} ] || [ ! -f $1 ]; then | |
echo "File does not exist or the parameter is empty." | |
exit 1 | |
fi | |
BAM=$1 | |
OUT=./hg38 | |
BINNED=hg38.binned_GC.bed | |
if [ ! -e ${OUT}/${BINNED} ]; then | |
echo "Generating genomic bins with GC content..." | |
get_reference | |
get_binned_GC_bed | |
else | |
echo "File ${BINNED} exists." | |
fi | |
get_downsample | |
get_coverage | |
get_GC_bias | |
cleanup | |
} | |
# Run main function | |
trap cleanup EXIT | |
main "$@" | |
exit 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment