Skip to content

Instantly share code, notes, and snippets.

@finswimmer
Created August 16, 2019 05:34
Show Gist options
  • Save finswimmer/877d678d10ffb35b87e954e74d2f17eb to your computer and use it in GitHub Desktop.
Save finswimmer/877d678d10ffb35b87e954e74d2f17eb to your computer and use it in GitHub Desktop.
rule target:
input:
bam = "SRR7526729_realigned.bam",
index = "SRR7526729_realigned.bam.bai"
rule umi_extract:
input:
R1 = "../{sample}_1.fastq.gz",
R2 = "../{sample}_2.fastq.gz"
output:
R1_UMI = "{sample}_1_umi.fastq.gz",
R2_UMI = "{sample}_2_umi.fastq.gz"
log:
"{sample}_umi_extract.log"
shell:
"""
paste <(unpigz -c {input.R1}|paste - - - -) <(unpigz -c {input.R2}|paste - - - -)|mawk -v FS="\\t" -v OFS="\\n" '{{umi = substr($6,1,12); split($1, R1, " "); split($5, R2, " "); print R1[1]"_"umi,$2,$3,$4|"pigz -c > {output.R1_UMI}"; print R2[1]"_"umi,substr($6,24),$7,substr($8,24)|"pigz -c > {output.R2_UMI}"}}'
"""
rule trim_fastq:
input:
R1_UMI = "{sample}_1_umi.fastq.gz",
R2_UMI = "{sample}_2_umi.fastq.gz"
output:
R1_TRIM = temp("{sample}_1_trim.fastq.gz"),
R2_TRIM = temp("{sample}_2_trim.fastq.gz")
shell:
"""
bbmerge.sh in={input.R1_UMI} in2={input.R2_UMI} out={output.R1_TRIM} out2={output.R2_TRIM} adapter=CAAAACGCAATACTGTACATT overwrite=true tbo ecco=t
"""
rule align_sort:
input:
R1_TRIM = "{sample}_1_trim.fastq.gz",
R2_TRIM = "{sample}_2_trim.fastq.gz"
output:
bam = temp("{sample}_alignment.bam")
params:
ref = "Homo_sapiens.GRCh37.dna.primary_assembly.fa",
tmp_dir ="./"
threads:
8
shell:
"""
bwa mem -R '@RG\\tID:SRR7526728\\tPL:ILLUMINA\\tSM:SRR7526728' -t 6 {params.ref} -M {input.R1_TRIM} {input.R2_TRIM} \\
| awk -v FS="\\t" -v OFS="\\t" '/^@/ {{print; next;}} {{ split($1, id, "_"); print $0,"RX:Z:"id[2];}}' \\
| fgbio SortBam -Djava.io.tmpdir={params.tmp_dir} -i /dev/stdin -s Queryname -o /dev/stdout \\
| fgbio SetMateInformation -o {output.bam}
"""
rule GroupReadsByUmi:
input:
bam = "{sample}_alignment.bam"
output:
bam = "{sample}_group.bam"
params:
tmp_dir ="./"
shell:
"""
fgbio GroupReadsByUmi -Djava.io.tmpdir={params.tmp_dir} -i {input.bam} -s adjacency -o {output.bam}
"""
rule CallMolecularConsensusReads:
input:
bam = "{sample}_group.bam"
output:
bam = "{sample}_consensus.bam"
params:
tmp_dir ="./"
shell:
"""
fgbio SortBam -Djava.io.tmpdir={params.tmp_dir} -s TemplateCoordinate -i {input.bam} -o /dev/stdout | fgbio CallMolecularConsensusReads -Djava.io.tmpdir={params.tmp_dir} -m 30 -M 1 -i /dev/stdin -o {output.bam}
"""
rule FilterConsensusReads:
input:
bam = "{sample}_consensus.bam"
output:
R1_consensus = "{sample}_R1_consensus.fastq.gz",
R2_consensus = "{sample}_R2_consensus.fastq.gz"
params:
ref = "Homo_sapiens.GRCh37.dna.primary_assembly.fa",
tmp_dir ="./"
threads:
8
shell:
"""
fgbio FilterConsensusReads -Djava.io.tmpdir={params.tmp_dir} -i {input.bam} -o /dev/stdout -r {params.ref} -M 4 -N 1 -E 0.010 \\
| samtools sort -@ 6 -n \\
| samtools fastq -1 {output.R1_consensus} -2 {output.R2_consensus} -
"""
rule AlignConsenus:
input:
R1_consensus = "{sample}_R1_consensus.fastq.gz",
R2_consensus = "{sample}_R2_consensus.fastq.gz"
output:
bam = "{sample}_realigned.bam",
index = "{sample}_realigned.bam.bai"
threads:
8
params:
ref = "Homo_sapiens.GRCh37.dna.primary_assembly.fa"
shell:
"""
bwa mem -R '@RG\\tID:SRR7526728\\tPL:ILLUMINA\\tSM:SRR7526728' -t 6 {params.ref} -M {input.R1_consensus} {input.R2_consensus}\\
| samtools sort -@6 -o {output.bam} -
samtools index {output.bam}
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment