claczny/Makefile.cmp_bash5tools_dextractor

## Makefile.cmp_bash5tools_dextractor
SHELL=/bin/bash

MOVIE=m151020_151817_00127_c100889452550000001823187103261622_s1_p0
FASTA=$(MOVIE).fasta
#QUIVA=$(MOVIE).quiva
BAS_H5=$(MOVIE).bas.h5
BAX_1_H5=$(MOVIE).1.bax.h5
BAX_2_H5=$(MOVIE).2.bax.h5
BAX_3_H5=$(MOVIE).3.bax.h5
METADATA_XML=$(MOVIE).metadata.xml

BASH5TOOLS_BIN=source ../hdf5/bin/activate; time bash5tools.py
BASH5TOOLS_OPTS=--readType subreads --outType fasta --minReadScore 0.75 --minLength 500 --outFilePrefix
# For details on the default options, e.g., min. read length or min. quality, see https://dazzlerblog.wordpress.com/command-guides/dextractor-command-guide/
DEXTRACTOR_BIN=time /path/to/DEXTRACTOR-1.0/bin/dextract

#####
# BEAUTY TARGETS
#####
.PHONY: \
 fetch_raw_data\
 extract_fasta_reads_pbh5tools\
 extract_fasta_reads_pbh5tools_no_intermediate\
 extract_fasta_reads_dextractor\
 cmp_pbh5tools_dextractor

all: cmp_pbh5tools_dextractor

fetch_raw_data: $(BAS_H5) $(BAX_1_H5) $(BAX_2_H5) $(BAX_3_H5) $(METADATA_XML)
extract_fasta_reads_pbh5tools: $(FASTA:=_pbh5tools)
extract_fasta_reads_pbh5tools_no_intermediate: $(FASTA:=_pbh5tools)
extract_fasta_reads_dextractor: $(FASTA:=_dextractor)

cmp_pbh5tools_dextractor: $(FASTA:=_pbh5tools) $(FASTA:=_pbh5tools_no_intermediate) $(FASTA:=_dextractor)
	# Compare subread counts
	grep -c "^>" $^
	# Compare the actual contained sequences
	for i in $^; do echo "$$i"; pullseq -i "$$i" -m 500 | grep -v "^>" | tr '[:upper:]' '[:lower:]' | md5sum; done

#####
# ACTUAL TARGETS
#####
# N.B. That this is *NOT* generic as the URL does *NOT* only depend on the movie name but also on the ERA RUN-ID
%.bas.h5 %.1.bax.h5 %.2.bax.h5 %.3.bax.h5 %.metadata.xml:
	wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.bas.h5
	wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.1.bax.h5
	wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.2.bax.h5
	wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.3.bax.h5
	wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.metadata.xml

%.fasta_pbh5tools: %.1.bax.h5 %.2.bax.h5 %.3.bax.h5
	# bash5tools - w/ intermediate files
	$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $*.1 $*.1.bax.h5
	$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $*.2 $*.2.bax.h5
	$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $*.3 $*.3.bax.h5
	cat $*.[1-3].fasta > $@
	@echo

# Uses bash5tools.py from pbh5tools. Much slower than dextractor
%.fasta_pbh5tools_no_intermediate: %.bas.h5
	# bash5tools - w/o intermediate files
	$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $*_bash5tools $^
	mv $*_bash5tools.fasta $@
	@echo

# The quality values are *not* used here and, thus, only the fasta sequences are saved.
# # I.e., the `-o` option is *not* used but the output written to stdout.
%.fasta_dextractor: %.1.bax.h5 %.2.bax.h5 %.3.bax.h5
	#dextractor - w/o intermediate files
	$(DEXTRACTOR_BIN) $^ > $@
	@echo
	SHELL=/bin/bash

	MOVIE=m151020_151817_00127_c100889452550000001823187103261622_s1_p0
	FASTA=$(MOVIE).fasta
	#QUIVA=$(MOVIE).quiva
	BAS_H5=$(MOVIE).bas.h5
	BAX_1_H5=$(MOVIE).1.bax.h5
	BAX_2_H5=$(MOVIE).2.bax.h5
	BAX_3_H5=$(MOVIE).3.bax.h5
	METADATA_XML=$(MOVIE).metadata.xml

	BASH5TOOLS_BIN=source ../hdf5/bin/activate; time bash5tools.py
	BASH5TOOLS_OPTS=--readType subreads --outType fasta --minReadScore 0.75 --minLength 500 --outFilePrefix
	# For details on the default options, e.g., min. read length or min. quality, see https://dazzlerblog.wordpress.com/command-guides/dextractor-command-guide/
	DEXTRACTOR_BIN=time /path/to/DEXTRACTOR-1.0/bin/dextract

	#####
	# BEAUTY TARGETS
	#####
	.PHONY: \
	fetch_raw_data\
	extract_fasta_reads_pbh5tools\
	extract_fasta_reads_pbh5tools_no_intermediate\
	extract_fasta_reads_dextractor\
	cmp_pbh5tools_dextractor

	all: cmp_pbh5tools_dextractor

	fetch_raw_data: $(BAS_H5) $(BAX_1_H5) $(BAX_2_H5) $(BAX_3_H5) $(METADATA_XML)
	extract_fasta_reads_pbh5tools: $(FASTA:=_pbh5tools)
	extract_fasta_reads_pbh5tools_no_intermediate: $(FASTA:=_pbh5tools)
	extract_fasta_reads_dextractor: $(FASTA:=_dextractor)

	cmp_pbh5tools_dextractor: $(FASTA:=_pbh5tools) $(FASTA:=_pbh5tools_no_intermediate) $(FASTA:=_dextractor)
	# Compare subread counts
	grep -c "^>" $^
	# Compare the actual contained sequences
	for i in $^; do echo "$$i"; pullseq -i "$$i" -m 500 \| grep -v "^>" \| tr '[:upper:]' '[:lower:]' \| md5sum; done

	#####
	# ACTUAL TARGETS
	#####
	# N.B. That this is NOT generic as the URL does NOT only depend on the movie name but also on the ERA RUN-ID
	%.bas.h5 %.1.bax.h5 %.2.bax.h5 %.3.bax.h5 %.metadata.xml:
	wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.bas.h5
	wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.1.bax.h5
	wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.2.bax.h5
	wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.3.bax.h5
	wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.metadata.xml

	%.fasta_pbh5tools: %.1.bax.h5 %.2.bax.h5 %.3.bax.h5
	# bash5tools - w/ intermediate files
	$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $.1 $.1.bax.h5
	$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $.2 $.2.bax.h5
	$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $.3 $.3.bax.h5
	cat $*.[1-3].fasta > $@
	@echo

	# Uses bash5tools.py from pbh5tools. Much slower than dextractor
	%.fasta_pbh5tools_no_intermediate: %.bas.h5
	# bash5tools - w/o intermediate files
	$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $*_bash5tools $^
	mv $*_bash5tools.fasta $@
	@echo

	# The quality values are not used here and, thus, only the fasta sequences are saved.
	# # I.e., the `-o` option is not used but the output written to stdout.
	%.fasta_dextractor: %.1.bax.h5 %.2.bax.h5 %.3.bax.h5
	#dextractor - w/o intermediate files
	$(DEXTRACTOR_BIN) $^ > $@
	@echo