Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@afrendeiro
Created November 14, 2016 15:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save afrendeiro/f91ac2c554557eb2f1e4fbe8f234e14e to your computer and use it in GitHub Desktop.
Save afrendeiro/f91ac2c554557eb2f1e4fbe8f234e14e to your computer and use it in GitHub Desktop.
Scripts to produce Quantile bigWig files as seen in http://www.nature.com/articles/ncomms11938/figures/2
from pipelines.models import Project
import os
import pandas as pd
import pysam
# Start project
prj = Project("metadata/project_config.yaml")
prj.add_sample_sheet()
bams = [sample.filtered for sample in prj.samples]
sizes = [int(pysam.Samfile(sample.filtered).mapped) for sample in prj.samples]
ratios = [size / 10000000. for size in sizes]
# write to disk
pd.Series(bams).to_csv(os.path.join(prj.dirs.data, "bamfiles.txt"), index=False)
pd.Series(ratios).to_csv(os.path.join(prj.dirs.data, "bamsizes.txt"), index=False)
# Start bash script to quantilize tracks
os.system("quantile_tracks.sh")
function make_windows() {
CHROM=$1
COUNT=`grep -P "^chr${CHROM}\t" resources/genomes/hg19/hg19.chrom.sizes | cut -f 2`
echo "chr$CHROM 1 $COUNT" | sed 's/ /\t/g' > ~/windows/tmp_${CHROM}.bed
}
export -f make_windows
function windows_position() {
CHROM=$1
bedtools makewindows -b ~/windows/tmp_${CHROM}.bed -w 1 | cut -f 1,2,3 > ~/windows/${CHROM}_position.bed
}
export -f windows_position
function get_coverage() {
CHROM=$1
BAM=$2
NAME=`basename ${BAM/.trimmed.bowtie2.filtered.bam/}`
RATIO=$3
echo "#! /bin/bash" > job.sh
echo "#SBATCH --partition=shortq" >> job.sh
echo "#SBATCH --ntasks=1" >> job.sh
echo "#SBATCH --cpus-per-task=2" >> job.sh
echo "#SBATCH --mem-per-cpu=4000" >> job.sh
echo "#SBATCH --job-name=${NAME}_chr${CHROM}" >> job.sh
echo "#SBATCH --output=/home/arendeiro/coverage/${NAME}_${CHROM}.log" >> job.sh
echo "hostname" >> job.sh
echo "date" >> job.sh
echo "bedtools coverage -d -abam $BAM -b ~/windows/tmp_${CHROM}.bed | cut -f 5 | awk '{print \$1/${RATIO}}' > ~/coverage/${NAME}_${CHROM}.coverage.bed" >> job.sh
echo "date" >> job.sh
sbatch job.sh
}
export -f get_coverage
function paste_together() {
CHROM=$1
#load bgzip
module load htslib
# read bams
readarray BAMS < /home/arendeiro/cll-patients/data/bamfiles.txt
# get sample and file names
i=0
for BAM in ${BAMS[*]}
do
NAMES[i]=`basename ${BAM/.trimmed.bowtie2.filtered.bam/}`
FILENAMES[i]=~/coverage/${NAMES[i]}_${CHROM}.coverage.bed
let i=i+1
done
echo "#! /bin/bash" > job.sh
echo "#SBATCH --partition=shortq" >> job.sh
echo "#SBATCH --ntasks=1" >> job.sh
echo "#SBATCH --cpus-per-task=2" >> job.sh
echo "#SBATCH --mem-per-cpu=4000" >> job.sh
echo "#SBATCH --job-name=${NAME}_chr${CHROM}" >> job.sh
echo "#SBATCH --output=/home/arendeiro/coverage/${NAME}_${CHROM}.log" >> job.sh
echo "hostname" >> job.sh
echo "date" >> job.sh
echo "paste ~/windows/${CHROM}_position.bed ${FILENAMES[*]} | bgzip > ~/coverage/all_chr_${CHROM}.bed.gz" >> job.sh
echo "date" >> job.sh
sbatch job.sh
}
export -f paste_together
function split_files() {
FILE=$1
NAME=${FILE/.bed.gz/}
echo "#! /bin/bash" > job.sh
echo "#SBATCH --partition=shortq" >> job.sh
echo "#SBATCH --ntasks=1" >> job.sh
echo "#SBATCH --cpus-per-task=1" >> job.sh
echo "#SBATCH --mem-per-cpu=4000" >> job.sh
echo "#SBATCH --job-name=${NAME}_chr${CHROM}" >> job.sh
echo "#SBATCH --output=/scratch/users/arendeiro/logs/split_files_${NAME}.log" >> job.sh
echo "hostname" >> job.sh
echo "date" >> job.sh
echo "zcat $FILE | split -a 5 -d -l 1000000 - ${NAME}_" >> job.sh
echo "date" >> job.sh
sbatch job.sh
}
export -f split_files
function compress(){
CHROM=$1
echo "#! /bin/bash" > job.sh
echo "#SBATCH --partition=shortq" >> job.sh
echo "#SBATCH --ntasks=1" >> job.sh
echo "#SBATCH --cpus-per-task=1" >> job.sh
echo "#SBATCH --mem-per-cpu=2000" >> job.sh
echo "#SBATCH --job-name=chr${CHROM}" >> job.sh
echo "hostname" >> job.sh
echo "date" >> job.sh
echo "gzip all_chr_${CHROM}_* ">> job.sh
echo "date" >> job.sh
sbatch job.sh
}
export -f compress
function quantilize() {
FILE=$1
echo "#! /bin/bash" > job.sh
echo "#SBATCH --partition=shortq" >> job.sh
echo "#SBATCH --ntasks=1" >> job.sh
echo "#SBATCH --cpus-per-task=1" >> job.sh
echo "#SBATCH --mem-per-cpu=2000" >> job.sh
echo "#SBATCH --job-name=quantilize_${FILE}" >> job.sh
echo "#SBATCH --output=/scratch/users/arendeiro/logs/quantilize_files_${FILE}.log" >> job.sh
echo "hostname" >> job.sh
echo "date" >> job.sh
echo "zcat $FILE | python ~/quantilize.py /fhgfs/groups/lab_bock/arendeiro/coverage/$FILE" >> job.sh
echo "date" >> job.sh
sbatch job.sh
}
export -f quantilize
function concatenate_back() {
SUFFIX=$1
echo "#! /bin/bash" > job.sh
echo "#SBATCH --partition=longq" >> job.sh
echo "#SBATCH --ntasks=1" >> job.sh
echo "#SBATCH --cpus-per-task=4" >> job.sh
echo "#SBATCH --mem-per-cpu=8000" >> job.sh
echo "#SBATCH --job-name=concatenate_back_${FILE}" >> job.sh
echo "#SBATCH --output=/scratch/users/arendeiro/logs/concatenate_back_files_${FILE}.log" >> job.sh
echo "hostname" >> job.sh
echo "date" >> job.sh
echo "cd /fhgfs/groups/lab_bock/arendeiro/coverage/" >> job.sh
echo "INPUT=(\`ls -v | grep -e all_chr_.*_${SUFFIX} | sort -z\`)" >> job.sh
echo "cat \${INPUT[@]} > all_${SUFFIX}.bedgraph" >> job.sh
echo "date" >> job.sh
sbatch job.sh
}
export -f concatenate_back
function bedgraph_to_bigwig() {
BEDGRAPH=$1
BIGWIG=${BEDGRAPH/bedgraph/bigwig}
echo "#! /bin/bash" > job.sh
echo "#SBATCH --partition=longq" >> job.sh
echo "#SBATCH --ntasks=1" >> job.sh
echo "#SBATCH --cpus-per-task=2" >> job.sh
echo "#SBATCH --mem-per-cpu=100000" >> job.sh
echo "#SBATCH --job-name=bedgraph_to_bigwig_${BEDGRAPH}" >> job.sh
echo "#SBATCH --output=/scratch/users/arendeiro/logs/bedgraph_to_bigwig_${BEDGRAPH}.log" >> job.sh
echo "hostname" >> job.sh
echo "date" >> job.sh
echo "cd /fhgfs/groups/lab_bock/arendeiro/coverage/" >> job.sh
echo "bedGraphToBigWig all_${BEDGRAPH} ~/resources/genomes/hg19/hg19.chromSizes ~/coverage/all_${BIGWIG}" >> job.sh
echo "date" >> job.sh
sbatch job.sh
}
export -f bedgraph_to_bigwig
# Start
# make 1bp windows for each chromosome
parallel make_windows ::: {1..22}
# get bam files
readarray BAMS < /home/arendeiro/cll-patients/data/bamfiles.txt
readarray RATIOS < /home/arendeiro/cll-patients/data/bamsizes.txt
# get coverage in each chromosome for each sample
for INDEX in ${!BAMS[@]}
do
for CHROM in {1..22}
do
get_coverage $CHROM ${BAMS[INDEX]} ${RATIOS[INDEX]}
done
done
# while you wait for those jobs,
parallel windows_position ::: {1..22}
# concatenate columns across samples for each chromosome
for CHROM in {1..22}
do
paste_together $CHROM
done
# split files in chunks
for CHROM in {1..22}
do
split_files all_chr_$CHROM.bed.gz
done
# compress file chunks
for CHROM in {1..22}
do
compress $CHROM
done
# get quantiles and mean
for FILE in `ls ~/coverage/ | grep -v bed | grep -v job`
do
quantilize $FILE
done
# concat chuncks across chromosomes
for SUFFIX in quant5 quant25 mean quant75 quant95
do
concatenate_back $SUFFIX
done
# make bigwigs
for SUFFIX in quant5 quant25 mean quant75 quant95
do
bedgraph_to_bigwig ${SUFFIX}.bedgraph
done
"""
Script to compute quantiles for a set of data.
Reads from standard input and writes (pseudo) BED files.
"""
import sys
import csv
import numpy as np
prefix = sys.argv[1]
wr5 = csv.writer(open("_".join([prefix, "quant5.bed"]), 'w'), delimiter='\t', lineterminator='\n')
wr25 = csv.writer(open("_".join([prefix, "quant25.bed"]), 'w'), delimiter='\t', lineterminator='\n')
wrm = csv.writer(open("_".join([prefix, "mean.bed"]), 'w'), delimiter='\t', lineterminator='\n')
wr75 = csv.writer(open("_".join([prefix, "quant75.bed"]), 'w'), delimiter='\t', lineterminator='\n')
wr95 = csv.writer(open("_".join([prefix, "quant95.bed"]), 'w'), delimiter='\t', lineterminator='\n')
for row in csv.reader(iter(sys.stdin.readline, ''), delimiter='\t'):
# get values
values = np.array(sorted(row[4:]), np.float)
# calculate quantiles and writerow
wr5.writerow(row[:3] + [np.percentile(values, 5)])
wr25.writerow(row[:3] + [np.percentile(values, 25)])
wrm.writerow(row[:3] + [values.mean()])
wr75.writerow(row[:3] + [np.percentile(values, 75)])
wr95.writerow(row[:3] + [np.percentile(values, 95)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment