Skip to content

Instantly share code, notes, and snippets.

@lpryszcz
Last active April 21, 2017 15:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lpryszcz/0470f25049646b774f3c78da0b3cbc95 to your computer and use it in GitHub Desktop.
Save lpryszcz/0470f25049646b774f3c78da0b3cbc95 to your computer and use it in GitHub Desktop.
Redundans test dataset
## Redundans scaffolds vs reference
cd run1
# align
nucmer -p nucmer.ref ../ref.fa scaffolds.filled.fa
# generate pairwise plot
mummerplot --png --large nucmer.ref.delta -p nucmer.ref.plot
# open plot
eog nucmer.ref.plot.png
## SPAdes contigs vs Redundans scaffolds
cd run1
# align
nucmer -p nucmer.contigs scaffolds.filled.fa ../contigs.fa
# generate pairwise plot
mummerplot --png --large nucmer.contigs.delta -p nucmer.contigs.plot
# open plot
eog nucmer.contigs.plot.png
## dipSPAdes consensus contigs vs reference
# align
nucmer -p nucmer.ref_dipspades ref.fa consensus_contigs.fa
# generate pairwise plot
mummerplot --png --large nucmer.ref_dipspades.delta -p nucmer.ref_dipspades.plot
# open plot
eog nucmer.ref_dipspades.plot.png
# the same can be done using LAST (this will work also for large genomes)
## Redundans scaffolds vs reference
# index reference genome (need to be done only once per each reference)
if [ ! -s ../ref.fa.suf ]; then lastdb ../ref.fa ../ref.fa; if
# align & generate dotplot on the fly
lastal -f TAB ../ref.fa scaffolds.filled.fa | last-dotplot - scaffolds.filled.fa.png
# open plot
eog scaffolds.filled.fa.png
###
# All programs/scripts not disclosed in Redundans repository (i.e. fasta2diverged.py),
# can be found in https://github.com/lpryszcz/bin.
###
# get chromosome & select region
wget "http://www.ebi.ac.uk/ena/data/view/HE605202&download=fasta&filename=CANPA.fa"
samtools faidx CANPA.fa "ENA|HE605202|HE605202.1:100000-200000" > ref.fa
# get heterozygous regions
fasta2diverged.py -v -i ref.fa --loh 0.40 --dna -d 0.05 > ref.d05.l40.fa
# simulate reads
# generate SNPs
~/src/GemSIM_v1.6/GemHaps.py -r ref.fa -g '.50,0 .50,6'
# 2x 50,000 reads
## 100 bp paired-end with 600 bp insert +/- 50 bp
~/src/GemSIM_v1.6/GemReads.py -r ref.fa -n 50000 -g ref.txt -l 100 -m /home/lpryszcz/src/GemSIM_v1.6/models/ill100v5_p.gzip -q 33 -o 600 -u 600 -s 50 -p
~/src/GemSIM_v1.6/GemReads.py -r ref.d05.l40.fa -n 50000 -g ref.txt -l 100 -m /home/lpryszcz/src/GemSIM_v1.6/models/ill100v5_p.gzip -q 33 -o 600.d05.l40 -u 600 -s 50 -p
# 2x 10,000 reads
## 50 bp mate-pairs with 5 kb insert +/- 500 bp
~/src/GemSIM_v1.6/GemReads.py -r ref.fa -n 10000 -g ref.txt -l 50 -m /home/lpryszcz/src/GemSIM_v1.6/models/ill100v5_p.gzip -q 33 -o 5000 -u 5000 -s 500 -p
~/src/GemSIM_v1.6/GemReads.py -r ref.d05.l40.fa -n 10000 -g ref.txt -l 50 -m /home/lpryszcz/src/GemSIM_v1.6/models/ill100v5_p.gzip -q 33 -o 5000.d05.l40 -u 5000 -s 500 -p
# combine both libraries in random order
## 600_
paste <(zcat 600_1.fastq.gz 600.d05.l40_1.fastq.gz) <(zcat 600_2.fastq.gz 600.d05.l40_2.fastq.gz) | paste - - - - | shuf | awk -F'\t' '{OFS="\n"; print $1,$3,$5,$7 | "gzip > random.600.d05.l40_1.fq.gz"; print $2,$4,$6,$8 | "gzip > random.600.d05.l40_2.fq.gz"}'
## 5000_
paste <(zcat 5000_1.fastq.gz 600.d05.l40_1.fastq.gz) <(zcat 5000_2.fastq.gz 5000.d05.l40_2.fastq.gz) | paste - - - - | shuf | awk -F'\t' '{OFS="\n"; print $1,$3,$5,$7 | "gzip > random.5000.d05.l40_1.fq.gz"; print $2,$4,$6,$8 | "gzip > random.5000.d05.l40_2.fq.gz"}'
# symlinks with more handy names
for f in *.fq.gz; do ln -s $f `echo $f | sed 's/random.//;s/.d05.l40//'`; done
# mate-pairs contaminated with paired-end reads
#for i in 1 2; do zcat <(zcat test/5000_$i.fq.gz test/5000_$i.fq.gz | fastx2reverse_complement.py fastq | gzip) test/600_$i.fq.gz > test/mixed.$i.fq.gz; done
# assembly SPAdes v3.5.0
dipspades.py --only-assembler -t 4 -o 600 -1 600_1.fq.gz -2 600_2.fq.gz
# run redundans
~/src/redundans/redundans.py -v -i *.fq.gz -f 600/spades/contigs.fasta -o run1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment