4DN consortium pipeline
Hi-C processing workflow based on# Dataset currently running
SRAid="SRR2184278"
# hg37 genome index
hg37_idx_path=/mnt/data/Resources/references/Grch37.r75.dna/Homo_sapiens.GRCh37.75.dna.fa
# Chromosome size file path
chr_sizes="/mnt/data/Sequencing/RPE-HiC/hg19.chr_size.noChrPrefix.txt"
fastq
from SRA
Dump # User fasterq-dump instead of fastq-dump for faster dumping
fasterq-dump ncbi/sra/"$SRAid".sra -S -O fastq -e 10 -vvvv
bwa mem
1. Align with # Map with bwa mem
bwa mem -SP5M -t 20 \
"$hg37_idx_path" \
fastq/"$SRAid".sra_1.fastq \
fastq/"$SRAid".sra_2.fastq \
| samtools view -Shb -@ 5 > bwa-out/"$SRAid".bam
# Or convert to bam with sambamba
sambamba view -f bam -S -h -p -t 20 \
-o bwa-out/"$SRAid".bam bwa-out/"$SRAid".sam
2. Parsing and sorting
!!!NOTE!!! The chromosome names in the $chr_sizes
file MUST match the headers of the reference genome files (i.e., the chromosome names in the output of bwa
), otherwise all reads will be categorized as unmapped from this step onward.
# Convert bam to sorted pairsam file
samtools view -h bwa-out/"$SRAid".bam | { \
pairtools parse -c "$chr_sizes" --add-columns mapq \
} | { \
pairtools sort --nproc 20 \
--memory 48G \
--compress-program "lz4c" \
--tmpdir pairsam \
--output pairsam/"$SRAid".sam.pairs.gz
}
3. Merging
This step should be used only to merge sequencing replicates!
# Merge all pairsam files into one
pairtools merge \
--max-nmerge $(ls pairsam/*.sam.pairs.gz | wc -l) \
--nproc 20 \
--memory 2G \
--output merged.sam.pairs.gz \
$(ls pairsam/*.sam.pairs.gz)
4. Marking duplicates
pairtools dedup --mark-dups --output-dups - --output-unmapped - --output marked.sam.pairs.gz merged.sam.pairs.gz
pairix marked.sam.pairs.gz # sanity check.
5. Filtering
## Generate lossless bam
pairtools split --output-sam lossless.bam marked.sam.pairs.gz
# Select UU, UR, RU reads
SRAid=SRR2184278_80
pairtools select '(pair_type == "UU") or (pair_type == "UR") or (pair_type == "RU")' \
--output-rest pairsam-dedup/"$SRAid".unmapped.sam.pairs.gz \
--output pairsam-dedup/"$SRAid".tmp.gz \
pairsam-marked/"$SRAid".marked.sam.pairs.gz
pairtools split --output-pairs pairsam-dedup/"$SRAid".tmp1.gz pairsam-dedup/"$SRAid".tmp.gz
pairtools select 'True' --chrom-subset $chr_sizes -o pairsam-dedup/"$SRAid".dedup.pairs.gz pairsam-dedup/"$SRAid".tmp1.gz
pairix pairsam-dedup/"$SRAid".dedup.pairs.gz # sanity check & indexing
rm pairsam-dedup/"$SRAid".tmp.gz pairsam-dedup/"$SRAid".tmp1.gz
6. Cooler
cooler cload pairix -p 10 -s 2 hg19.chr_size.noChrPrefix.txt:1000 \
pairsam-dedup/"$SRAid".dedup.pairs.gz cool/"$SRAid".cool
cooler zoomify -n 10 -o cool/"$SRAid".mcool -c 10000000 \
-r 1000,2000,5000,10000,25000,50000,100000,250000,500000,1000000,2500000,5000000,10000000 \
cool/"$SRAid".cool
7. Juicer
SRAid="SRR2184278-93"
# Convert to hic format
java -Xmx64g -Xms64g \
-jar /home/gire/ownCloud/BiCro/Code/vendor/juicer_tools.1.8.9_jcuda.0.8.jar \
pre -n pairsam-dedup/"$SRAid".dedup.pairs.gz hic/"$SRAid".hic \
hg19.chr_size.noChrPrefix.txt \
-r 1000,2000,5000,10000,25000,50000,100000,250000,500000,1000000,2500000,5000000,10000000 \
-q 0
# Add normalization
java -Xmx64g -Xms64g \
-jar /home/gire/ownCloud/BiCro/Code/vendor/juicer_tools.1.8.9_jcuda.0.8.jar \
addNorm -w 1000 -d -F hic/"$SRAid".hic
8. QC
pairsqc_root="/home/gire/ownCloud/BiCro/Code/vendor/pairsqc"
curdir=$(pwd)
cd "$pairsqc_root"
python3 pairsqc.py -p /mnt/data/Sequencing/RPE-HiC/pairsam-dedup/"$SRAid".dedup.pairs.gz -c /mnt/data/Sequencing/RPE-HiC/hg19.chr_size.noChrPrefix.txt -tP -s "$SRAid" -O "$SRAid"
Rscript plot.r 4 "$SRAid"_report
mv "$SRAid"_report $curdir/report
cd $curdir
9. FANC
Conversion
SRAid=SRR2184278-80
res="100kb"
fanc from-cooler cool/"$SRAid".mcool@100000 fanc/"$SRAid"."$res".fanc
Plotting
SRAid=SRR2184278-80
res="500kb"
fancplot -o plots/"$SRAid"."$res".square $(seq 1 22) X Y \
-p square fanc/"$SRAid"."$res".fanc
cd fanc
fanc expected -p ../plots/"$SRAid"."$res".oe.line.png \
"$SRAid"."$res".fanc "$SRAid"."$res".oe.tsv
fancplot -o ../plots/"$SRAid"."$res".oe.square $(seq 1 22) X Y \
-p square -e "$SRAid"."$res".fanc -vmin -2 -vmax 2
cd ..
fanc compartments -v fanc/"$SRAid"."$res".ev.txt \
fanc/"$SRAid"."$res".fanc fanc/"$SRAid"."$res".ab
fancplot -o plots/"$SRAid"."$res".ab.square $(seq 1 22) X Y \
-p square fanc/"$SRAid"."$res".ab -vmin -.075 -vmax 0.75 -c RdBu_r \
-p line fanc/"$SRAid"."$res".ev.txt