rpetit3/rufus-cleanup.py

## rufus-cleanup.py
#! /usr/bin/env python
"""Clean up an input FASTQ file."""
import sys

from ruffus import *

from staphopia.helpers.time_job import time_job
from staphopia.tasks import fastq, shared

parser = cmdline.get_argparse(description='Cleanup FASTQ files')
parser.add_argument("fastq", help="Compressed FASTQ file (*.tar.gz)",)
parser.add_argument("-f2", "--fastq2", dest="fastq2",
                    help="Compressed FASTQ file (*.tar.gz)",)
parser.add_argument('-p', '--processors', metavar="INT", type=int, default=1,
                    help='Number of processors to use. (Default 1)')
parser.add_argument('--coverage', metavar="INT", type=int, default=100,
                    help='Coverage to subsample to.')
parser.add_argument('--paired', action='store_true', default=False,
                    help='Input is interleaved paired end reads.', )
parser.add_argument('--no_length_filter', action='store_true', default=False,
                    help='Do not filter reads based on read lengths.', )
parser.add_argument('--sample_tag', dest='sample_tag', default='sample_tag',
                    help='Optional: Sample tag of input. (Default sample_tag)')
parser.add_argument('--log_times', action='store_true', default=False,
                    help='Write task run times to file (Default: STDERR).', )
options = parser.parse_args()

NUM_CPU = str(options.processors)
OUT_DIR = 'analyses/fastq-qc/'
BBDUK_PHIX = 'logs/fastq_cleanup-bbduk-phix.txt'
BBDUK_ADAPTER = 'logs/fastq_cleanup-bbduk-adapter.txt'
BBDUK_ECC = 'logs/fastq_cleanup-bbduk-ecc.txt'
TIME_LOG = sys.stderr
if options.log_times:
    TIME_LOG = 'logs/time/fastq_cleanup.txt'
# Pipeline --------------------------------------------------------------------


@mkdir('logs/time')
@mkdir('analyses/fastq-stats')
@transform(
    options.fastq, regex(r"(.*)"),
    "analyses/fastq-stats/{0}.original.fastq.json".format(options.sample_tag)
)
@time_job(TIME_LOG, new_stream=True)
def raw_stats(input_file, output_file):
    """Calculate sequence stats of the input FASTQ."""
    fastq.stats(input_file, output_file, fastq2=options.fastq2)


@follows(raw_stats)
@transform(raw_stats, regex(r"(.*)"),
           '{0}.cleanup.fastq.gz'.format(options.sample_tag))
@time_job(TIME_LOG)
def cleanup(input_file, output_file):
    """Clean up FASTQ based on statistics."""
    shared.run_command(['rm', '-rf', OUT_DIR])
    shared.run_command(['mkdir', '-p', OUT_DIR])

    # Filter out phiX and adapter sequences
    nophix = fastq.filter_phix(options.fastq, NUM_CPU, OUT_DIR, BBDUK_PHIX,
                               fastq2=options.fastq2)
    noadapter = fastq.filter_adapters(
        nophix['fastq'], NUM_CPU, OUT_DIR, BBDUK_ADAPTER,
        fastq2=nophix['fastq2']
    )

    stats_file = "analyses/fastq-stats/{0}.post-adapter.fastq.json".format(
        options.sample_tag
    )
    fastq.stats(noadapter['fastq'], stats_file, fastq2=noadapter['fastq2'],
                compressed=False)

    ecc = fastq.error_correct(
        noadapter['fastq'], NUM_CPU, OUT_DIR, fastq2=noadapter['fastq2']
    )

    stats_file = "analyses/fastq-stats/{0}.post-ecc.fastq.json".format(
        options.sample_tag
    )
    fastq.stats(ecc['fastq'], stats_file, fastq2=ecc['fastq2'],
                compressed=False)
    fastq.cleanup(ecc['fastq'], stats_file, options.paired,
                  options.no_length_filter, output_file,
                  fastq2=ecc['fastq2'],
                  coverage=options.coverage)

    shared.run_command(['rm', '-rf', OUT_DIR])


@follows(cleanup)
@transform(cleanup, regex(r"(.*).gz"), r"analyses/fastq-stats/\1.json")
@time_job(TIME_LOG)
def cleanup_stats(input_file, output_file):
    """Calculate sequence stats of the cleaned up FASTQ."""
    fastq.stats(input_file, output_file)


# -----------------------------------------------------------------------------
pipeline_run(exceptions_terminate_immediately=True, verbose=5)
	#! /usr/bin/env python
	"""Clean up an input FASTQ file."""
	import sys

	from ruffus import *

	from staphopia.helpers.time_job import time_job
	from staphopia.tasks import fastq, shared

	parser = cmdline.get_argparse(description='Cleanup FASTQ files')
	parser.add_argument("fastq", help="Compressed FASTQ file (*.tar.gz)",)
	parser.add_argument("-f2", "--fastq2", dest="fastq2",
	help="Compressed FASTQ file (*.tar.gz)",)
	parser.add_argument('-p', '--processors', metavar="INT", type=int, default=1,
	help='Number of processors to use. (Default 1)')
	parser.add_argument('--coverage', metavar="INT", type=int, default=100,
	help='Coverage to subsample to.')
	parser.add_argument('--paired', action='store_true', default=False,
	help='Input is interleaved paired end reads.', )
	parser.add_argument('--no_length_filter', action='store_true', default=False,
	help='Do not filter reads based on read lengths.', )
	parser.add_argument('--sample_tag', dest='sample_tag', default='sample_tag',
	help='Optional: Sample tag of input. (Default sample_tag)')
	parser.add_argument('--log_times', action='store_true', default=False,
	help='Write task run times to file (Default: STDERR).', )
	options = parser.parse_args()

	NUM_CPU = str(options.processors)
	OUT_DIR = 'analyses/fastq-qc/'
	BBDUK_PHIX = 'logs/fastq_cleanup-bbduk-phix.txt'
	BBDUK_ADAPTER = 'logs/fastq_cleanup-bbduk-adapter.txt'
	BBDUK_ECC = 'logs/fastq_cleanup-bbduk-ecc.txt'
	TIME_LOG = sys.stderr
	if options.log_times:
	TIME_LOG = 'logs/time/fastq_cleanup.txt'
	# Pipeline --------------------------------------------------------------------


	@mkdir('logs/time')
	@mkdir('analyses/fastq-stats')
	@transform(
	options.fastq, regex(r"(.*)"),
	"analyses/fastq-stats/{0}.original.fastq.json".format(options.sample_tag)
	)
	@time_job(TIME_LOG, new_stream=True)
	def raw_stats(input_file, output_file):
	"""Calculate sequence stats of the input FASTQ."""
	fastq.stats(input_file, output_file, fastq2=options.fastq2)


	@follows(raw_stats)
	@transform(raw_stats, regex(r"(.*)"),
	'{0}.cleanup.fastq.gz'.format(options.sample_tag))
	@time_job(TIME_LOG)
	def cleanup(input_file, output_file):
	"""Clean up FASTQ based on statistics."""
	shared.run_command(['rm', '-rf', OUT_DIR])
	shared.run_command(['mkdir', '-p', OUT_DIR])

	# Filter out phiX and adapter sequences
	nophix = fastq.filter_phix(options.fastq, NUM_CPU, OUT_DIR, BBDUK_PHIX,
	fastq2=options.fastq2)
	noadapter = fastq.filter_adapters(
	nophix['fastq'], NUM_CPU, OUT_DIR, BBDUK_ADAPTER,
	fastq2=nophix['fastq2']
	)

	stats_file = "analyses/fastq-stats/{0}.post-adapter.fastq.json".format(
	options.sample_tag
	)
	fastq.stats(noadapter['fastq'], stats_file, fastq2=noadapter['fastq2'],
	compressed=False)

	ecc = fastq.error_correct(
	noadapter['fastq'], NUM_CPU, OUT_DIR, fastq2=noadapter['fastq2']
	)

	stats_file = "analyses/fastq-stats/{0}.post-ecc.fastq.json".format(
	options.sample_tag
	)
	fastq.stats(ecc['fastq'], stats_file, fastq2=ecc['fastq2'],
	compressed=False)
	fastq.cleanup(ecc['fastq'], stats_file, options.paired,
	options.no_length_filter, output_file,
	fastq2=ecc['fastq2'],
	coverage=options.coverage)

	shared.run_command(['rm', '-rf', OUT_DIR])


	@follows(cleanup)
	@transform(cleanup, regex(r"(.*).gz"), r"analyses/fastq-stats/\1.json")
	@time_job(TIME_LOG)
	def cleanup_stats(input_file, output_file):
	"""Calculate sequence stats of the cleaned up FASTQ."""
	fastq.stats(input_file, output_file)


	# -----------------------------------------------------------------------------
	pipeline_run(exceptions_terminate_immediately=True, verbose=5)