Skip to content

Instantly share code, notes, and snippets.

@ggirelli
Last active January 29, 2024 21:57
Show Gist options
  • Save ggirelli/bbc52daad8f16564777785fbe98ec6ed to your computer and use it in GitHub Desktop.
Save ggirelli/bbc52daad8f16564777785fbe98ec6ed to your computer and use it in GitHub Desktop.
Hi-C processing

Hi-C processing workflow based on 4DN consortium pipeline

  1. Dumping
  2. Alignment
  3. Parsing & sorting
  4. Merging
  5. Duplicates
  6. Filtering
  7. Cooler
  8. Juicer
  9. QC
  10. FANC
# Dataset currently running
SRAid="SRR2184278"

# hg37 genome index
hg37_idx_path=/mnt/data/Resources/references/Grch37.r75.dna/Homo_sapiens.GRCh37.75.dna.fa

# Chromosome size file path
chr_sizes="/mnt/data/Sequencing/RPE-HiC/hg19.chr_size.noChrPrefix.txt"

Dump fastq from SRA

# User fasterq-dump instead of fastq-dump for faster dumping
fasterq-dump ncbi/sra/"$SRAid".sra -S -O fastq -e 10 -vvvv

1. Align with bwa mem

# Map with bwa mem
bwa mem -SP5M -t 20 \
  "$hg37_idx_path" \
  fastq/"$SRAid".sra_1.fastq \
  fastq/"$SRAid".sra_2.fastq \
  | samtools view -Shb -@ 5 > bwa-out/"$SRAid".bam
# Or convert to bam with sambamba
sambamba view -f bam -S -h -p -t 20 \
  -o bwa-out/"$SRAid".bam bwa-out/"$SRAid".sam 

2. Parsing and sorting

!!!NOTE!!! The chromosome names in the $chr_sizes file MUST match the headers of the reference genome files (i.e., the chromosome names in the output of bwa), otherwise all reads will be categorized as unmapped from this step onward.

# Convert bam to sorted pairsam file
samtools view -h bwa-out/"$SRAid".bam | { \
  pairtools parse -c "$chr_sizes" --add-columns mapq \
} | { \
  pairtools sort --nproc 20 \
  --memory 48G \
  --compress-program "lz4c" \
  --tmpdir pairsam \
  --output pairsam/"$SRAid".sam.pairs.gz
}

3. Merging

This step should be used only to merge sequencing replicates!

# Merge all pairsam files into one 
pairtools merge \
  --max-nmerge $(ls pairsam/*.sam.pairs.gz | wc -l) \
  --nproc 20 \
  --memory 2G \
  --output merged.sam.pairs.gz \
  $(ls pairsam/*.sam.pairs.gz)

4. Marking duplicates

pairtools dedup --mark-dups --output-dups - --output-unmapped - --output marked.sam.pairs.gz merged.sam.pairs.gz
pairix marked.sam.pairs.gz  # sanity check.

5. Filtering

## Generate lossless bam
pairtools split --output-sam lossless.bam marked.sam.pairs.gz

# Select UU, UR, RU reads
SRAid=SRR2184278_80
pairtools select '(pair_type == "UU") or (pair_type == "UR") or (pair_type == "RU")' \
    --output-rest pairsam-dedup/"$SRAid".unmapped.sam.pairs.gz \
    --output pairsam-dedup/"$SRAid".tmp.gz \
    pairsam-marked/"$SRAid".marked.sam.pairs.gz
pairtools split --output-pairs pairsam-dedup/"$SRAid".tmp1.gz pairsam-dedup/"$SRAid".tmp.gz
pairtools select 'True' --chrom-subset $chr_sizes -o pairsam-dedup/"$SRAid".dedup.pairs.gz pairsam-dedup/"$SRAid".tmp1.gz
pairix pairsam-dedup/"$SRAid".dedup.pairs.gz  # sanity check & indexing
rm pairsam-dedup/"$SRAid".tmp.gz pairsam-dedup/"$SRAid".tmp1.gz

6. Cooler

cooler cload pairix -p 10 -s 2 hg19.chr_size.noChrPrefix.txt:1000 \
  pairsam-dedup/"$SRAid".dedup.pairs.gz cool/"$SRAid".cool

cooler zoomify -n 10 -o cool/"$SRAid".mcool -c 10000000 \
  -r 1000,2000,5000,10000,25000,50000,100000,250000,500000,1000000,2500000,5000000,10000000 \
  cool/"$SRAid".cool

7. Juicer

SRAid="SRR2184278-93"

# Convert to hic format
java -Xmx64g -Xms64g \
    -jar /home/gire/ownCloud/BiCro/Code/vendor/juicer_tools.1.8.9_jcuda.0.8.jar \
    pre -n pairsam-dedup/"$SRAid".dedup.pairs.gz hic/"$SRAid".hic \
    hg19.chr_size.noChrPrefix.txt \
    -r 1000,2000,5000,10000,25000,50000,100000,250000,500000,1000000,2500000,5000000,10000000 \
    -q 0

# Add normalization
java -Xmx64g -Xms64g \
    -jar /home/gire/ownCloud/BiCro/Code/vendor/juicer_tools.1.8.9_jcuda.0.8.jar \
    addNorm -w 1000 -d -F hic/"$SRAid".hic

8. QC

pairsqc_root="/home/gire/ownCloud/BiCro/Code/vendor/pairsqc"
curdir=$(pwd)
cd "$pairsqc_root"
python3 pairsqc.py -p /mnt/data/Sequencing/RPE-HiC/pairsam-dedup/"$SRAid".dedup.pairs.gz -c /mnt/data/Sequencing/RPE-HiC/hg19.chr_size.noChrPrefix.txt -tP -s "$SRAid" -O "$SRAid"
Rscript plot.r 4 "$SRAid"_report
mv "$SRAid"_report $curdir/report
cd $curdir

9. FANC

Conversion

SRAid=SRR2184278-80
res="100kb"
fanc from-cooler cool/"$SRAid".mcool@100000 fanc/"$SRAid"."$res".fanc

Plotting

SRAid=SRR2184278-80
res="500kb"

fancplot -o plots/"$SRAid"."$res".square $(seq 1 22) X Y \
  -p square fanc/"$SRAid"."$res".fanc

cd fanc
fanc expected -p ../plots/"$SRAid"."$res".oe.line.png \
  "$SRAid"."$res".fanc "$SRAid"."$res".oe.tsv
fancplot -o ../plots/"$SRAid"."$res".oe.square $(seq 1 22) X Y \
  -p square -e "$SRAid"."$res".fanc -vmin -2 -vmax 2
cd ..

fanc compartments -v fanc/"$SRAid"."$res".ev.txt \
  fanc/"$SRAid"."$res".fanc fanc/"$SRAid"."$res".ab
fancplot -o plots/"$SRAid"."$res".ab.square $(seq 1 22) X Y \
  -p square fanc/"$SRAid"."$res".ab -vmin -.075 -vmax 0.75 -c RdBu_r \
  -p line fanc/"$SRAid"."$res".ev.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment