DATA = SRR013437
# Allowed errors in barcodes and primers
# Reads with errors in these regions may still be usable.
BCDE_ERS = 2
PRMR_ERS = 3
# Nucleotide differences allowed when clustering sequences
DIFF = 4
# mothur trim.seqs options
TRIMOPTS = bdiffs=$(BCDE_ERS), pdiffs=$(PRMR_ERS), minlength=$(LENGTH), processors=$(PS)
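# For example, with LENGTH=200 and PS=4 (hypothetical values; LENGTH and PS
# are assumed to be defined in the included ../Makefile), TRIMOPTS expands to:
#   bdiffs=2, pdiffs=3, minlength=200, processors=4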
# Expected number of clusters
EXPCT = 23
all: .$(DATA).read_accurate.tab.xz.uploaded
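# Usage: `make` builds the table for $(DATA); the accession can be overridden
# on the command line, e.g. `make DATA=<SRA run accession>`. The .xz and
# .uploaded suffix rules are assumed to come from the included ../Makefile.
# Combine the accurate (flagged 0) and inaccurate (flagged 1) read lists into
# a single sorted table with a 'read bad.read' header.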
%.read_accurate.tab: %.accurate.txt %.inaccurate.txt
	sed 's/$$/ 0/' $*.accurate.txt > $@.tmp
	sed 's/$$/ 1/' $*.inaccurate.txt >> $@.tmp
	echo 'read bad.read' > $@
	sort $@.tmp >> $@
	rm $@.tmp
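# Inaccurate reads: read names present in the names file but absent from the
# accurate list (comm -2 -3 keeps lines unique to its first input).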
%.inaccurate.txt: %.accurate.txt %.unique.names
	cut -f 2 $(lastword $^) | tr ',' '\n' | sort | comm -2 -3 - $< > $@
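# Accurate reads: expand the comma-separated read lists of every unique
# sequence that appears in the correct-sequence list.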
%.accurate.txt: %.correct.txt %.unique.names
	join $^ | cut -f 2 -d ' ' | tr ',' '\n' | sort > $@
# Identify the unique sequences corresponding to the original reads
# The number of lines in this file should match EXPCT
%.correct.txt: %.align.fasta
	mothur '#pre.cluster(fasta=$<, name=$*.unique.names, diffs=$(DIFF))' > /dev/null
	cut -f 1 $*.align.precluster.names | sort > $@
	rm mothur.*.logfile $*.align.precluster.*
# Align the top (expected*2) unique sequences
%.align.fasta: %.counts.tab %.unique.names
	head -n `echo '$(EXPCT) * 2' | bc` $< | cut -f 1 -d ' ' | xargs -I {} egrep -A 1 '{}$$' $*.unique.fasta | mafft - > $@ 2> /dev/null
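# With EXPCT=23 the bc call above evaluates to 46, so the 46 most abundant
# unique sequences are pulled from $*.unique.fasta and aligned with mafft.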
# Count the number of reads mapping to each unique sequence
%.counts.tab: ./bin/count-reads %.unique.names
	$^ | sort -r -n -k 2,2 > $@
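# Dereplicate the processed reads; unique.seqs writes a fasta of unique
# sequences and a names file mapping each one to the reads it represents.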
%.unique.names: %.processed.fasta
	mothur '#unique.seqs(fasta=$<)' > /dev/null
	mv $*.processed.unique.fasta $*.unique.fasta
	sort $*.processed.names > $@
	rm mothur.*.logfile $*.processed.names
# Remove barcode and filter sequences on length and errors
# Then trim sequences down to required length
%.processed.fasta: %.filtered.fasta primers.txt
	mothur '#trim.seqs(fasta=$<,oligos=$(lastword $^),$(TRIMOPTS))' > /dev/null
	fastx_trimmer -i $(basename $<).trim.fasta -o $@ -l $(LENGTH)
	rm mothur.*.logfile $(basename $<).groups $(basename $<).trim.fasta
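# Decompress the filtered reads (the fastq-to-fasta conversion is assumed to
# be handled by a rule in the included ../Makefile).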
%.filtered.fastq: %.filtered.fastq.xz
	xz --decompress $<
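# Decompress the primer/barcode oligos file, keeping the compressed original.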
primers.txt: primers.txt.xz
	xz --keep --decompress $<
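# Fetch the compressed primers file; INPUT_DATA is assumed to be a download
# command prefix defined in the included ../Makefile.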
primers.txt.xz:
	$(INPUT_DATA)$@
include ../Makefile