Hi-C processing workflow based on 4DN consortium pipeline
# Dataset currently running
SRAid="SRR2184278"
# hg37 genome index
hg37_idx_path=/mnt/data/Resources/references/Grch37.r75.dna/Homo_sapiens.GRCh37.75.dna.fa
# Chromosome size file path
chr_sizes="/mnt/data/Sequencing/RPE-HiC/hg19.chr_size.noChrPrefix.txt"
# User fasterq-dump instead of fastq-dump for faster dumping
fasterq-dump ncbi/sra/"$SRAid".sra -S -O fastq -e 10 -vvvv
# Map with bwa mem
bwa mem -SP5M -t 20 \
"$hg37_idx_path" \
fastq/"$SRAid".sra_1.fastq \
fastq/"$SRAid".sra_2.fastq \
| samtools view -Shb -@ 5 > bwa-out/"$SRAid".bam
# Or convert to bam with sambamba
sambamba view -f bam -S -h -p -t 20 \
-o bwa-out/"$SRAid".bam bwa-out/"$SRAid".sam
!!!NOTE!!! The chromosome names in the $chr_sizes
file MUST match the headers of the reference genome files (i.e., the chromosome names in the output of bwa
), otherwise all reads will be categorized as unmapped from this step onward.
# Convert bam to sorted pairsam file
samtools view -h bwa-out/"$SRAid".bam | { \
pairtools parse -c "$chr_sizes" --add-columns mapq \
} | { \
pairtools sort --nproc 20 \
--memory 48G \
--compress-program "lz4c" \
--tmpdir pairsam \
--output pairsam/"$SRAid".sam.pairs.gz
}
This step should be used only to merge sequencing replicates!
# Merge all pairsam files into one
pairtools merge \
--max-nmerge $(ls pairsam/*.sam.pairs.gz | wc -l) \
--nproc 20 \
--memory 2G \
--output merged.sam.pairs.gz \
$(ls pairsam/*.sam.pairs.gz)
pairtools dedup --mark-dups --output-dups - --output-unmapped - --output marked.sam.pairs.gz merged.sam.pairs.gz
pairix marked.sam.pairs.gz # sanity check.
## Generate lossless bam
pairtools split --output-sam lossless.bam marked.sam.pairs.gz
# Select UU, UR, RU reads
SRAid=SRR2184278_80
pairtools select '(pair_type == "UU") or (pair_type == "UR") or (pair_type == "RU")' \
--output-rest pairsam-dedup/"$SRAid".unmapped.sam.pairs.gz \
--output pairsam-dedup/"$SRAid".tmp.gz \
pairsam-marked/"$SRAid".marked.sam.pairs.gz
pairtools split --output-pairs pairsam-dedup/"$SRAid".tmp1.gz pairsam-dedup/"$SRAid".tmp.gz
pairtools select 'True' --chrom-subset $chr_sizes -o pairsam-dedup/"$SRAid".dedup.pairs.gz pairsam-dedup/"$SRAid".tmp1.gz
pairix pairsam-dedup/"$SRAid".dedup.pairs.gz # sanity check & indexing
rm pairsam-dedup/"$SRAid".tmp.gz pairsam-dedup/"$SRAid".tmp1.gz
cooler cload pairix -p 10 -s 2 hg19.chr_size.noChrPrefix.txt:1000 \
pairsam-dedup/"$SRAid".dedup.pairs.gz cool/"$SRAid".cool
cooler zoomify -n 10 -o cool/"$SRAid".mcool -c 10000000 \
-r 1000,2000,5000,10000,25000,50000,100000,250000,500000,1000000,2500000,5000000,10000000 \
cool/"$SRAid".cool
SRAid="SRR2184278-93"
# Convert to hic format
java -Xmx64g -Xms64g \
-jar /home/gire/ownCloud/BiCro/Code/vendor/juicer_tools.1.8.9_jcuda.0.8.jar \
pre -n pairsam-dedup/"$SRAid".dedup.pairs.gz hic/"$SRAid".hic \
hg19.chr_size.noChrPrefix.txt \
-r 1000,2000,5000,10000,25000,50000,100000,250000,500000,1000000,2500000,5000000,10000000 \
-q 0
# Add normalization
java -Xmx64g -Xms64g \
-jar /home/gire/ownCloud/BiCro/Code/vendor/juicer_tools.1.8.9_jcuda.0.8.jar \
addNorm -w 1000 -d -F hic/"$SRAid".hic
pairsqc_root="/home/gire/ownCloud/BiCro/Code/vendor/pairsqc"
curdir=$(pwd)
cd "$pairsqc_root"
python3 pairsqc.py -p /mnt/data/Sequencing/RPE-HiC/pairsam-dedup/"$SRAid".dedup.pairs.gz -c /mnt/data/Sequencing/RPE-HiC/hg19.chr_size.noChrPrefix.txt -tP -s "$SRAid" -O "$SRAid"
Rscript plot.r 4 "$SRAid"_report
mv "$SRAid"_report $curdir/report
cd $curdir
SRAid=SRR2184278-80
res="100kb"
fanc from-cooler cool/"$SRAid".mcool@100000 fanc/"$SRAid"."$res".fanc
SRAid=SRR2184278-80
res="500kb"
fancplot -o plots/"$SRAid"."$res".square $(seq 1 22) X Y \
-p square fanc/"$SRAid"."$res".fanc
cd fanc
fanc expected -p ../plots/"$SRAid"."$res".oe.line.png \
"$SRAid"."$res".fanc "$SRAid"."$res".oe.tsv
fancplot -o ../plots/"$SRAid"."$res".oe.square $(seq 1 22) X Y \
-p square -e "$SRAid"."$res".fanc -vmin -2 -vmax 2
cd ..
fanc compartments -v fanc/"$SRAid"."$res".ev.txt \
fanc/"$SRAid"."$res".fanc fanc/"$SRAid"."$res".ab
fancplot -o plots/"$SRAid"."$res".ab.square $(seq 1 22) X Y \
-p square fanc/"$SRAid"."$res".ab -vmin -.075 -vmax 0.75 -c RdBu_r \
-p line fanc/"$SRAid"."$res".ev.txt