Skip to content

Instantly share code, notes, and snippets.

@ctb
Last active November 2, 2016 13:22
Show Gist options
  • Save ctb/124bc83c079ddcaf599fd7e46bf0d626 to your computer and use it in GitHub Desktop.
Save ctb/124bc83c079ddcaf599fd7e46bf0d626 to your computer and use it in GitHub Desktop.
Some pydoit stuff for Ethan and Matt et al.

short demonstration of pydoit for workflow organization

See Camille's pydoit for automation tutorial for details.

Briefly,

Check out make-sim-catlas -- here, I'm defining a bunch of doit tasks ('task_make_simulated_reads' etc) & parameterizing them with specific filenames. Then, I'm adding three more tasks and telling pydoit to just run 'em all.

You can see how tasks are defined here -- essentially, you define the command to run ('actions'), the dependencies ('file_dep'), and the output ('targets').

So far I've been really impressed with this as an alternative to make. Note I'm still using a Makefile at a high level, but the individual tasks are defined in terms of doit.

#! /usr/bin/env python
from doit.cmd_base import TaskLoader
from doit.doit_cmd import DoitMain
from doit.task import dict_to_task
class TaskFailed(Exception):
    """Raised by `run_tasks` when the doit run returns a non-zero status."""
# Running counter used by make_task to build a default name for task
# dictionaries that do not supply their own 'name'.
_task_count = 1
def make_task(task_dict_func):
    '''Wrapper to decorate functions returning pydoit
    `Task` dictionaries and have them return pydoit `Task`
    objects.

    If the returned dictionary has no 'name' key, a name of the form
    "<count>.func<function name>" is generated from the module-level
    `_task_count` counter, which is then incremented.

    The returned wrapper keeps the decorated function's `__name__` and
    `__doc__` (via `functools.wraps`), so decorated task factories stay
    identifiable in tracebacks and help output.
    '''
    import functools

    @functools.wraps(task_dict_func)
    def d_to_t(*args, **kwargs):
        global _task_count
        ret_dict = task_dict_func(*args, **kwargs)
        if 'name' not in ret_dict:
            # Only anonymous task dictionaries consume a counter value.
            ret_dict['name'] = "{0}.func<{1}>".format(
                _task_count, task_dict_func.__name__)
            _task_count += 1
        return dict_to_task(ret_dict)

    return d_to_t
def run_tasks(tasks, args, config=None):
    '''Given a list of `Task` objects, a list of arguments,
    and a config dictionary, execute the tasks.

    `args` is passed unchanged to `DoitMain.run` (the callers in this
    project use ['run'] or ['clean']).

    `config` defaults to {'verbosity': 0}.  A fresh dictionary is created
    on every call so that the default is never shared between runs.

    Raises TypeError if `tasks` is not a list, and TaskFailed (carrying
    the status) if doit returns a non-zero status.
    '''
    if not isinstance(tasks, list):
        raise TypeError('tasks must be of type list.')
    if config is None:
        config = {'verbosity': 0}

    class Loader(TaskLoader):
        # Hands the pre-built tasks straight to doit instead of
        # discovering them from a dodo file.
        @staticmethod
        def load_tasks(cmd, opt_values, pos_args):
            return tasks, config

    status = DoitMain(Loader()).run(args)
    if status:
        raise TaskFailed(status)
#! /usr/bin/env python
"""
Build a catlas from a collection of contigs/genomes.
This script will build a compact De Bruijn graph from the input contigs and
build a catlas from that.
"""
import os
from doit_utils import run_tasks
from spg_tasklib import *
def main():
    """Parse the command line and run (or clean) the genome catlas tasks.

    Two tasks are scheduled: a labeled walk of the input FASTA files into
    `dirname`, followed by the catlas build at the requested radius.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('dirname')
    parser.add_argument('inp_fasta', nargs='+')
    parser.add_argument('-r', '--radius', type=int, default=3)
    parser.add_argument('-M', '--memory', type=float, default=1e9)
    parser.add_argument('--clean', default=False, action='store_true')
    args = parser.parse_args()

    tasks = [
        task_walk_dbg(args.inp_fasta, args.dirname,
                      memory=args.memory, label=True),
        task_build_catlas(args.dirname, args.radius),
    ]

    # --clean switches the doit sub-command from 'run' to 'clean'.
    doit_command = 'clean' if args.clean else 'run'
    run_tasks(tasks, [doit_command])


if __name__ == '__main__':
    main()
#! /usr/bin/env python
"""
Build a catlas from a set of real reads.
This script will error-trim a set of real reads, build a compact De Bruijn
graph from them, and build a catlas from that.
"""
import os
from doit_utils import run_tasks
from spg_tasklib import *
def main():
    """Parse the command line and run (or clean) the reads catlas tasks.

    Three tasks are scheduled: abundance-trim the input reads, walk the
    trimmed reads into `dirname`, and build the catlas at the requested
    radius.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('dirname')
    parser.add_argument('inp_reads', nargs='+')
    parser.add_argument('-r', '--radius', type=int, default=3)
    parser.add_argument('-M', '--memory', type=float, default=1e9)
    parser.add_argument('--clean', default=False, action='store_true')
    args = parser.parse_args()

    # task_trim_reads declares its targets as basename + '.abundtrim', so
    # the names handed to the next task must be built the same way;
    # otherwise reads given with a directory component would yield
    # dependencies that the trim task never claims to produce.
    trim_files = [os.path.basename(t) + '.abundtrim'
                  for t in args.inp_reads]

    tasks = [
        # produces trim_files
        task_trim_reads(args.inp_reads, memory=args.memory),
        # => DBG
        task_walk_dbg(trim_files, args.dirname, memory=args.memory),
        task_build_catlas(args.dirname, args.radius),
    ]

    # --clean switches the doit sub-command from 'run' to 'clean'.
    run_tasks(tasks, ['clean' if args.clean else 'run'])


if __name__ == '__main__':
    main()
#! /usr/bin/env python
"""
Build a catlas from a set of reads simulated from input contigs.
This script will create a set of fake reads from the input contigs,
then error trim them, build a compact De Bruijn graph from them, and
build a catlas from that.
"""
import os
from doit_utils import run_tasks
from spg_tasklib import *
def main():
    """Parse the command line and run (or clean) the simulated-reads tasks.

    One read-simulation task is scheduled per input FASTA file, followed
    by a single trim task, the DBG walk into `dirname`, and the catlas
    build at the requested radius.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('dirname')
    parser.add_argument('inp_fasta', nargs='+')
    parser.add_argument('-r', '--radius', type=int, default=3)
    parser.add_argument('--clean', default=False, action='store_true')
    args = parser.parse_args()

    # Same naming rule as task_make_simulated_reads: basename with its
    # last five characters replaced by 'reads.fa'.
    read_files = []
    for fasta in args.inp_fasta:
        read_files.append(os.path.basename(fasta)[:-5] + 'reads.fa')
    trim_files = [name + '.abundtrim' for name in read_files]

    # produces read_files
    tasks = [task_make_simulated_reads(fasta) for fasta in args.inp_fasta]
    # produces trim_files
    tasks.append(task_trim_reads(read_files))
    # => DBG
    tasks.append(task_walk_dbg(trim_files, args.dirname))
    tasks.append(task_build_catlas(args.dirname, args.radius))

    doit_command = 'clean' if args.clean else 'run'
    run_tasks(tasks, [doit_command])


if __name__ == '__main__':
    main()
# Directory holding the spacegraphcats scripts invoked via $(SPG) below.
SPG=../../spacegraphcats

# The rule names are commands to run, not output files to compare
# timestamps against.  Declaring them phony keeps a same-named path
# (the clean rule removes one called acido-reads-catlas) or a stray file
# called "clean" from ever making a rule look up to date.
.PHONY: all clean acido-sim-catlas acido-genome-catlas acido-reads-catlas

all: acido-sim-catlas acido-genome-catlas acido-reads-catlas
# Re-run every workflow script with --clean, then remove the output
# directories themselves.
clean:
	./make-sim-catlas.py acido-chunk-reads data/acido-chunk[12].fa.gz --clean
	./make-genome-catlas.py acido-genome data/acido-chunk[12].fa.gz --clean
	./recover-reads.py acido-chunk-reads \
		data/acido-chunk1.fa.sig.dump.txt \
		acido-chunk1.reads.fa.abundtrim acido-chunk1-1.extract.fa --clean
	./recover-reads.py acido-chunk-reads \
		data/acido-chunk1.fa.sig.dump.txt \
		acido-chunk2.reads.fa.abundtrim acido-chunk1-2.extract.fa --clean
	./recover-reads.py acido-chunk-reads \
		data/acido-chunk2.fa.sig.dump.txt \
		acido-chunk1.reads.fa.abundtrim acido-chunk2-1.extract.fa --clean
	./recover-reads.py acido-chunk-reads \
		data/acido-chunk2.fa.sig.dump.txt \
		acido-chunk2.reads.fa.abundtrim acido-chunk2-2.extract.fa --clean
	./make-reads-catlas.py acido-reads-catlas acido-chunk[12].reads.fa --clean
	./make-genome-catlas.py 15genome data/15genome.fa.gz -M 4e9 --clean
	rm -fr acido-genome acido-chunk-reads acido-reads-catlas 15genome
# Build the catlas from simulated reads, then recover the reads that
# match the chunk 1 signature.
# NOTE(review): the recover-reads command previously ended in a dangling
# line continuation, which swallowed the next rule header.  The read file
# and output file below mirror the first recover-reads invocation in the
# clean rule -- confirm these are the intended arguments.
acido-sim-catlas:
	./make-sim-catlas.py acido-chunk-reads data/acido-chunk[12].fa.gz
	./recover-reads.py acido-chunk-reads data/acido-chunk1.fa.sig.dump.txt \
		acido-chunk1.reads.fa.abundtrim acido-chunk1-1.extract.fa
# Build the catlas directly from the genome chunks, then run the
# domgraph node search against both chunk signature dumps.
acido-genome-catlas:
	./make-genome-catlas.py acido-genome data/acido-chunk[12].fa.gz
	$(SPG)/search-for-domgraph-nodes.py --strategy frontier-jacc \
		acido-genome 3 data/acido-chunk[12].fa.sig.dump.txt
# Build a catlas from the acido-chunk[12].reads.fa files; the
# acido-sim-catlas rule runs first.
acido-reads-catlas: acido-sim-catlas
	./make-reads-catlas.py acido-reads-catlas acido-chunk[12].reads.fa
#! /usr/bin/env python
"""
Use the given signature dumpfile to extract reads from the catlas & graph.
This script searches the given catlas with the signature file, extracts
the matching cDBG nodes, and then pulls the reads that contributed to the
nodes out of the given reads file.
The signature dump file can be generated from 'sourmash dump'.
"""
import os
from doit_utils import run_tasks
from spg_tasklib import *
def main():
    """Parse the command line and run (or clean) the read-recovery tasks.

    Two tasks are scheduled: find the cDBG nodes in `catlas` that match
    `sigfile` (written to a file under ./temp), then extract the reads
    from `readfile` for those nodes into `outfile`.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('catlas')
    parser.add_argument('sigfile')
    parser.add_argument('readfile')
    parser.add_argument('outfile')
    parser.add_argument('-r', '--radius', type=int, default=3)
    parser.add_argument('--clean', default=False, action='store_true')
    args = parser.parse_args()

    # An already-existing directory is fine; any other failure (for
    # example a permissions problem) is reported here rather than
    # surfacing later when a task tries to write into temp/.
    os.makedirs('temp', exist_ok=True)

    nodes_file = 'temp/{0}.{1}.x.{2}'.format(args.catlas, args.radius,
                                             os.path.basename(args.sigfile))

    tasks = [
        task_gimme_dbg_nodes(args.catlas, args.radius,
                             args.sigfile, 'gathermins2',
                             '--searchlevel 2', nodes_file),
        task_gimme_reads(args.readfile, nodes_file, args.outfile),
    ]

    # --clean switches the doit sub-command from 'run' to 'clean'.
    run_tasks(tasks, ['clean' if args.clean else 'run'])


if __name__ == '__main__':
    main()
import os.path
import glob
import shutil
from doit_utils import make_task
from doit.tools import run_once
from doit.task import clean_targets
# Directory prefixes substituted into the command lines of the tasks
# below: NULLGRAPH_LOCATION for make-reads.py, SPG_LOCATION for the
# walk-dbg / build-catlas / gimme-* scripts.
NULLGRAPH_LOCATION='../../nullgraph'
SPG_LOCATION='../../spacegraphcats'
@make_task
def task_make_simulated_reads(inp_filename):
    """Task: run make-reads.py on `inp_filename`.

    The command's stdout is redirected to the target file, which is named
    after the input's basename with its last five characters replaced by
    'reads.fa'.
    """
    reads_file = os.path.basename(inp_filename)[:-5] + 'reads.fa'
    command = "{0}/make-reads.py {1} -e .01 -r 100 -C 20 > {2}".format(
        NULLGRAPH_LOCATION, inp_filename, reads_file)

    return {
        'name': 'make_simulated_reads<{0}>'.format(inp_filename),
        'actions': [command],
        'targets': [reads_file],
        'uptodate': [run_once],
        'file_dep': [inp_filename],
        'clean': [clean_targets],
    }
@make_task
def task_trim_reads(orig_files, memory=1e9):
    """Task: run trim-low-abund.py once over all of `orig_files`.

    Targets are each input's basename with '.abundtrim' appended;
    `memory` is passed to the script's -M option.
    """
    joined_inputs = " ".join(orig_files)
    command = 'trim-low-abund.py -Z 20 -C 3 -M {1} -k 31 {0}'.format(
        joined_inputs, memory)

    trimmed = []
    for path in orig_files:
        trimmed.append(os.path.basename(path) + '.abundtrim')

    return dict(
        name='task_trim_reads<{0}.{1}>'.format(",".join(orig_files), memory),
        actions=[command],
        targets=trimmed,
        uptodate=[run_once],
        file_dep=orig_files,
        clean=[clean_targets],
    )
@make_task
def task_walk_dbg(orig_files, output_dir, label=False, memory=1e9):
    """Task: run walk-dbg.py over `orig_files`, writing into `output_dir`.

    The first action removes any existing `output_dir`; the second runs
    the script with -M `memory` and, when `label` is true, --label.
    Targets are the .gxt and .mxt files named after `output_dir`.
    """
    def remove_previous_output():
        # A missing directory just means there is nothing to remove.
        try:
            shutil.rmtree(output_dir)
        except FileNotFoundError:
            pass

    # The memory value is baked into the template first; the remaining
    # placeholders are filled in once the optional flag is appended.
    template = '{0}/walk-dbg.py -k 31 -o {1} {2}' + ' -M {0}'.format(memory)
    if label:
        template += ' --label'
        label_tag = 'l'
    else:
        label_tag = 'n'

    inputs = " ".join(orig_files)
    command = template.format(SPG_LOCATION, output_dir, inputs)
    task_name = 'walk_dbg<{0}.{1}.{2}.{3}>'.format(
        ",".join(orig_files), output_dir, memory, label_tag)
    outputs = ['{0}/{0}.gxt'.format(output_dir),
               '{0}/{0}.mxt'.format(output_dir)]

    return dict(
        name=task_name,
        actions=[remove_previous_output, command],
        targets=outputs,
        file_dep=orig_files,
        uptodate=[run_once],
        clean=[clean_targets],
    )
@make_task
def task_build_catlas(dirname, radius):
    """Task: run build-catlas.py on `dirname` at the given `radius`.

    Depends on the .gxt/.mxt files named after `dirname` and declares the
    assignment, catlas and domgraph files for that radius as targets.
    """
    target_templates = ('{0}/{0}.assignment.{1}.vxt',
                        '{0}/{0}.catlas.{1}.gxt',
                        '{0}/{0}.catlas.{1}.mxt',
                        '{0}/{0}.domgraph.{1}.gxt')
    outputs = [template.format(dirname, radius)
               for template in target_templates]

    command = '{0}/build-catlas.py {1} {2}'.format(
        SPG_LOCATION, dirname, radius)
    graph_inputs = ['{0}/{0}.gxt'.format(dirname),
                    '{0}/{0}.mxt'.format(dirname)]

    return dict(
        name='build_catlas<{0}.{1}>'.format(dirname, radius),
        actions=[command],
        targets=outputs,
        uptodate=[run_once],
        file_dep=graph_inputs,
        clean=[clean_targets],
    )
@make_task
def task_gimme_dbg_nodes(catlasdir, radius, sigfile, strategy, args, outfile):
    """Task: run gimme-dbg-nodes.py, writing the result to `outfile`.

    Depends on the assignment, catlas and domgraph files of `catlasdir`
    for the given `radius`.  `strategy` goes to --strategy and `args` is
    inserted verbatim into the command line.
    """
    dep_templates = ('{0}/{0}.assignment.{1}.vxt',
                     '{0}/{0}.catlas.{1}.gxt',
                     '{0}/{0}.catlas.{1}.mxt',
                     '{0}/{0}.domgraph.{1}.gxt')
    catlas_files = [template.format(catlasdir, radius)
                    for template in dep_templates]

    command = (
        '{0}/gimme-dbg-nodes.py {1} {2} {3} --strategy {4} {5} -o {6}'
    ).format(SPG_LOCATION, catlasdir, radius, sigfile,
             strategy, args, outfile)

    title = 'finding matching nodes to {0} in {1}'.format(sigfile, catlasdir)

    def title_fn(task):
        # The title does not depend on the task object it is given.
        return title

    return dict(
        name='gimme_dbg_nodes<{0}.{1}.{2}.{3}.{4}>'.format(
            catlasdir, radius, sigfile, strategy, outfile),
        title=title_fn,
        actions=[command],
        targets=[outfile],
        uptodate=[run_once],
        file_dep=catlas_files,
        clean=[clean_targets],
    )
@make_task
def task_gimme_reads(readsfile, nodes_file, outfile):
    """Task: run gimme-reads.py on `readsfile` and `nodes_file`.

    The output is written to `outfile`, the task's only target; both
    inputs are file dependencies.
    """
    command = '{0}/gimme-reads.py {1} {2} -o {3}'.format(
        SPG_LOCATION, readsfile, nodes_file, outfile)
    title = 'extracting reads from {0}\n\t=> {1}'.format(readsfile, outfile)

    def title_fn(task):
        # The title does not depend on the task object it is given.
        return title

    return dict(
        name='gimme_reads<{0}.{1}.{2}>'.format(readsfile, nodes_file, outfile),
        title=title_fn,
        actions=[command],
        targets=[outfile],
        uptodate=[run_once],
        file_dep=[readsfile, nodes_file],
        clean=[clean_targets],
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment