Last active
August 8, 2016 11:24
-
-
Save claczny/66118554b513d1cb404cf505d5449686 to your computer and use it in GitHub Desktop.
Makefile to compare bash5tools and dextractor module, both for the extraction of FASTA-formatted reads from PacBio data. Includes downloading of the *large* raw PacBio data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SHELL=/bin/bash | |
MOVIE=m151020_151817_00127_c100889452550000001823187103261622_s1_p0 | |
FASTA=$(MOVIE).fasta | |
#QUIVA=$(MOVIE).quiva | |
BAS_H5=$(MOVIE).bas.h5 | |
BAX_1_H5=$(MOVIE).1.bax.h5 | |
BAX_2_H5=$(MOVIE).2.bax.h5 | |
BAX_3_H5=$(MOVIE).3.bax.h5 | |
METADATA_XML=$(MOVIE).metadata.xml | |
BASH5TOOLS_BIN=source ../hdf5/bin/activate; time bash5tools.py | |
BASH5TOOLS_OPTS=--readType subreads --outType fasta --minReadScore 0.75 --minLength 500 --outFilePrefix | |
# For details on the default options, e.g., min. read length or min. quality, see https://dazzlerblog.wordpress.com/command-guides/dextractor-command-guide/ | |
DEXTRACTOR_BIN=time /path/to/DEXTRACTOR-1.0/bin/dextract | |
##### | |
# BEAUTY TARGETS | |
##### | |
.PHONY: \ | |
fetch_raw_data\ | |
extract_fasta_reads_pbh5tools\ | |
extract_fasta_reads_pbh5tools_no_intermediate\ | |
extract_fasta_reads_dextractor\ | |
cmp_pbh5tools_dextractor | |
all: cmp_pbh5tools_dextractor | |
fetch_raw_data: $(BAS_H5) $(BAX_1_H5) $(BAX_2_H5) $(BAX_3_H5) $(METADATA_XML) | |
extract_fasta_reads_pbh5tools: $(FASTA:=_pbh5tools) | |
extract_fasta_reads_pbh5tools_no_intermediate: $(FASTA:=_pbh5tools) | |
extract_fasta_reads_dextractor: $(FASTA:=_dextractor) | |
cmp_pbh5tools_dextractor: $(FASTA:=_pbh5tools) $(FASTA:=_pbh5tools_no_intermediate) $(FASTA:=_dextractor) | |
# Compare subread counts | |
grep -c "^>" $^ | |
# Compare the actual contained sequences | |
for i in $^; do echo "$$i"; pullseq -i "$$i" -m 500 | grep -v "^>" | tr '[:upper:]' '[:lower:]' | md5sum; done | |
##### | |
# ACTUAL TARGETS | |
##### | |
# N.B. That this is *NOT* generic as the URL does *NOT* only depend on the movie name but also on the ERA RUN-ID | |
%.bas.h5 %.1.bax.h5 %.2.bax.h5 %.3.bax.h5 %.metadata.xml: | |
wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.bas.h5 | |
wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.1.bax.h5 | |
wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.2.bax.h5 | |
wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.3.bax.h5 | |
wget ftp://ftp.sra.ebi.ac.uk/vol1/ERA552/ERA552098/pacbio_hdf5/$*.metadata.xml | |
%.fasta_pbh5tools: %.1.bax.h5 %.2.bax.h5 %.3.bax.h5 | |
# bash5tools - w/ intermediate files | |
$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $*.1 $*.1.bax.h5 | |
$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $*.2 $*.2.bax.h5 | |
$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $*.3 $*.3.bax.h5 | |
cat $*.[1-3].fasta > $@ | |
@echo | |
# Uses bash5tools.py from pbh5tools. Much slower than dextractor | |
%.fasta_pbh5tools_no_intermediate: %.bas.h5 | |
# bash5tools - w/o intermediate files | |
$(BASH5TOOLS_BIN) $(BASH5TOOLS_OPTS) $*_bash5tools $^ | |
mv $*_bash5tools.fasta $@ | |
@echo | |
# The quality values are *not* used here and, thus, only the fasta sequences are saved. | |
# # I.e., the `-o` option is *not* used but the output written to stdout. | |
%.fasta_dextractor: %.1.bax.h5 %.2.bax.h5 %.3.bax.h5 | |
#dextractor - w/o intermediate files | |
$(DEXTRACTOR_BIN) $^ > $@ | |
@echo |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Necessary adjustments include to specify the path to a working
dextractor
binary, to make sure thath5py
is installed and working so that thepbh5tools
(https://github.com/PacificBiosciences/pbh5tools) can be correctly installed. I installedh5py
by creating a virtual python environment and installingh5py
viapip
. Then I sourced that environment and installed thepbh5tools
, which includebash5tools.py
, according to https://github.com/PacificBiosciences/pbh5tools/blob/master/doc/index.rst#installation.The data which will be downloaded here is about 12 GB. The extracted FASTA file, i.e., no quality values etc., is ~1 GB.
The
BASH5TOOLS_OPTS
--minReadScore 0.75 --minLength 500
are chosen to match thedextractor
defaults.N.B. The Makefile presents two ways to use
bash5tools.py
: One, which creates an intermediate FASTA file for each .bax.h5 file, and another, which works directly on the .bas.h5 file, i.e., creates no intermediate file. The results of all three variants (bash5tools.py
w/ intermediate files,bash5tools.py
w/o intermediate files, anddextractor
) are essentially the same (formatting and FASTA headers may differ), however,dextractor
is by far the fastest.