Last active
April 21, 2017 15:33
-
-
Save lpryszcz/0470f25049646b774f3c78da0b3cbc95 to your computer and use it in GitHub Desktop.
Redundans test dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Redundans scaffolds vs reference | |
cd run1 | |
# align | |
nucmer -p nucmer.ref ../ref.fa scaffolds.filled.fa | |
# generate pairwise plot | |
mummerplot --png --large nucmer.ref.delta -p nucmer.ref.plot | |
# open plot | |
eog nucmer.ref.plot.png | |
## SPAdes contigs vs Redundans scaffolds | |
cd run1 | |
# align | |
nucmer -p nucmer.contigs scaffolds.filled.fa ../contigs.fa | |
# generate pairwise plot | |
mummerplot --png --large nucmer.contigs.delta -p nucmer.contigs.plot | |
# open plot | |
eog nucmer.contigs.plot.png | |
## dipSPAdes consensus contigs vs reference | |
# align | |
nucmer -p nucmer.ref_dipspades ref.fa consensus_contigs.fa | |
# generate pairwise plot | |
mummerplot --png --large nucmer.ref_dipspades.delta -p nucmer.ref_dipspades.plot | |
# open plot | |
eog nucmer.ref_dipspades.plot.png | |
# the same can be done using LAST (this will work also for large genomes) | |
## Redundans scaffolds vs reference | |
# index reference genome (need to be done only once per each reference) | |
if [ ! -s ../ref.fa.suf ]; then lastdb ../ref.fa ../ref.fa; if | |
# align & generate dotplot on the fly | |
lastal -f TAB ../ref.fa scaffolds.filled.fa | last-dotplot - scaffolds.filled.fa.png | |
# open plot | |
eog scaffolds.filled.fa.png |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### | |
# All programs/scripts not disclosed in Redundans repository (i.e. fasta2diverged.py), | |
# can be found in https://github.com/lpryszcz/bin. | |
### | |
# get chromosome & select region | |
wget "http://www.ebi.ac.uk/ena/data/view/HE605202&download=fasta&filename=CANPA.fa" | |
samtools faidx CANPA.fa "ENA|HE605202|HE605202.1:100000-200000" > ref.fa | |
# get heterozygous regions | |
fasta2diverged.py -v -i ref.fa --loh 0.40 --dna -d 0.05 > ref.d05.l40.fa | |
# simulate reads | |
# generate SNPs | |
~/src/GemSIM_v1.6/GemHaps.py -r ref.fa -g '.50,0 .50,6' | |
# 2x 50,000 reads | |
## 100 bp paired-end with 600 bp insert +/- 50 bp | |
~/src/GemSIM_v1.6/GemReads.py -r ref.fa -n 50000 -g ref.txt -l 100 -m /home/lpryszcz/src/GemSIM_v1.6/models/ill100v5_p.gzip -q 33 -o 600 -u 600 -s 50 -p | |
~/src/GemSIM_v1.6/GemReads.py -r ref.d05.l40.fa -n 50000 -g ref.txt -l 100 -m /home/lpryszcz/src/GemSIM_v1.6/models/ill100v5_p.gzip -q 33 -o 600.d05.l40 -u 600 -s 50 -p | |
# 2x 10,000 reads | |
## 50 bp mate-pairs with 5 kb insert +/- 500 bp | |
~/src/GemSIM_v1.6/GemReads.py -r ref.fa -n 10000 -g ref.txt -l 50 -m /home/lpryszcz/src/GemSIM_v1.6/models/ill100v5_p.gzip -q 33 -o 5000 -u 5000 -s 500 -p | |
~/src/GemSIM_v1.6/GemReads.py -r ref.d05.l40.fa -n 10000 -g ref.txt -l 50 -m /home/lpryszcz/src/GemSIM_v1.6/models/ill100v5_p.gzip -q 33 -o 5000.d05.l40 -u 5000 -s 500 -p | |
# combine both libraries in random order | |
## 600_ | |
paste <(zcat 600_1.fastq.gz 600.d05.l40_1.fastq.gz) <(zcat 600_2.fastq.gz 600.d05.l40_2.fastq.gz) | paste - - - - | shuf | awk -F'\t' '{OFS="\n"; print $1,$3,$5,$7 | "gzip > random.600.d05.l40_1.fq.gz"; print $2,$4,$6,$8 | "gzip > random.600.d05.l40_2.fq.gz"}' | |
## 5000_ | |
paste <(zcat 5000_1.fastq.gz 600.d05.l40_1.fastq.gz) <(zcat 5000_2.fastq.gz 5000.d05.l40_2.fastq.gz) | paste - - - - | shuf | awk -F'\t' '{OFS="\n"; print $1,$3,$5,$7 | "gzip > random.5000.d05.l40_1.fq.gz"; print $2,$4,$6,$8 | "gzip > random.5000.d05.l40_2.fq.gz"}' | |
# symlinks with more handy names | |
for f in *.fq.gz; do ln -s $f `echo $f | sed 's/random.//;s/.d05.l40//'`; done | |
# mate-pairs contaminated with paired-end reads | |
#for i in 1 2; do zcat <(zcat test/5000_$i.fq.gz test/5000_$i.fq.gz | fastx2reverse_complement.py fastq | gzip) test/600_$i.fq.gz > test/mixed.$i.fq.gz; done | |
# assembly SPAdes v3.5.0 | |
dipspades.py --only-assembler -t 4 -o 600 -1 600_1.fq.gz -2 600_2.fq.gz | |
# run redundans | |
~/src/redundans/redundans.py -v -i *.fq.gz -f 600/spades/contigs.fasta -o run1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment