Created
October 18, 2017 23:08
-
-
Save rpetit3/15fc8e596b0bd2630914f01f974664d1 to your computer and use it in GitHub Desktop.
Ruffus version of FASTQ clean up
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
"""Clean up an input FASTQ file.""" | |
import sys | |
from ruffus import * | |
from staphopia.helpers.time_job import time_job | |
from staphopia.tasks import fastq, shared | |
# Command-line interface ------------------------------------------------------
# `cmdline` is provided by the `from ruffus import *` star import above.
parser = cmdline.get_argparse(description='Cleanup FASTQ files')

# Input files: one required FASTQ plus an optional second FASTQ.
parser.add_argument(
    "fastq",
    help="Compressed FASTQ file (*.tar.gz)"
)
parser.add_argument(
    "-f2", "--fastq2",
    dest="fastq2",
    help="Compressed FASTQ file (*.tar.gz)"
)

# Numeric tuning knobs.
parser.add_argument(
    '-p', '--processors',
    metavar="INT",
    type=int,
    default=1,
    help='Number of processors to use. (Default 1)'
)
parser.add_argument(
    '--coverage',
    metavar="INT",
    type=int,
    default=100,
    help='Coverage to subsample to.'
)

# Boolean switches (all off unless given on the command line).
parser.add_argument(
    '--paired',
    action='store_true',
    default=False,
    help='Input is interleaved paired end reads.'
)
parser.add_argument(
    '--no_length_filter',
    action='store_true',
    default=False,
    help='Do not filter reads based on read lengths.'
)

# Sample naming; the tag is used as the prefix of every output file name.
parser.add_argument(
    '--sample_tag',
    dest='sample_tag',
    default='sample_tag',
    help='Optional: Sample tag of input. (Default sample_tag)'
)
parser.add_argument(
    '--log_times',
    action='store_true',
    default=False,
    help='Write task run times to file (Default: STDERR).'
)

options = parser.parse_args()

# Module-level settings shared by the pipeline tasks ---------------------------
# The processor count is kept as a string; it is handed to the fastq helpers
# as-is.
NUM_CPU = str(options.processors)

# Scratch directory; the cleanup task deletes and recreates it.
OUT_DIR = 'analyses/fastq-qc/'

# Log file paths passed to the BBDuk-based filtering steps.
# NOTE(review): BBDUK_ECC is defined but not referenced by the tasks visible in
# this file.
BBDUK_PHIX = 'logs/fastq_cleanup-bbduk-phix.txt'
BBDUK_ADAPTER = 'logs/fastq_cleanup-bbduk-adapter.txt'
BBDUK_ECC = 'logs/fastq_cleanup-bbduk-ecc.txt'

# Destination handed to @time_job: a file path when --log_times is given,
# otherwise the STDERR stream.
TIME_LOG = 'logs/time/fastq_cleanup.txt' if options.log_times else sys.stderr
# Pipeline -------------------------------------------------------------------- | |
@mkdir('logs/time')
@mkdir('analyses/fastq-stats')
@transform(
    options.fastq,
    regex(r"(.*)"),
    "analyses/fastq-stats/{0}.original.fastq.json".format(options.sample_tag)
)
@time_job(TIME_LOG, new_stream=True)
def raw_stats(input_file, output_file):
    """Write sequence statistics for the untouched input FASTQ.

    First task of the pipeline. The decorators above make sure the
    ``logs/time`` and ``analyses/fastq-stats`` directories exist, map the
    command-line FASTQ to ``<sample_tag>.original.fastq.json`` and time the
    run via ``time_job``.

    Args:
        input_file: The FASTQ given on the command line (``options.fastq``).
        output_file: Path of the JSON statistics file to produce.
    """
    # The optional second FASTQ is forwarded as-is; it is None when --fastq2
    # was not supplied.
    fastq.stats(input_file, output_file, fastq2=options.fastq2)
@follows(raw_stats)
@transform(raw_stats, regex(r"(.*)"),
           '{0}.cleanup.fastq.gz'.format(options.sample_tag))
@time_job(TIME_LOG)
def cleanup(input_file, output_file):
    """Clean up the input FASTQ and write ``<sample_tag>.cleanup.fastq.gz``.

    Steps, in order: phiX filtering, adapter filtering, statistics on the
    adapter-filtered reads, error correction, statistics on the corrected
    reads, and the final ``fastq.cleanup`` call that produces ``output_file``
    using the post-error-correction statistics.

    All intermediate files live in ``OUT_DIR``, which is wiped before the
    work starts and is always removed afterwards -- including when one of the
    steps raises -- so a failed run does not leave intermediates behind.

    Args:
        input_file: Output of ``raw_stats``. Only used by Ruffus for
            dependency tracking; the reads are taken from ``options.fastq``
            and ``options.fastq2``.
        output_file: Path of the cleaned, compressed FASTQ to produce.
    """
    # Start from an empty scratch directory.
    shared.run_command(['rm', '-rf', OUT_DIR])
    shared.run_command(['mkdir', '-p', OUT_DIR])

    try:
        # Filter out phiX and adapter sequences. Each step returns a mapping
        # with 'fastq' and 'fastq2' entries that feeds the next step.
        nophix = fastq.filter_phix(
            options.fastq, NUM_CPU, OUT_DIR, BBDUK_PHIX,
            fastq2=options.fastq2
        )
        noadapter = fastq.filter_adapters(
            nophix['fastq'], NUM_CPU, OUT_DIR, BBDUK_ADAPTER,
            fastq2=nophix['fastq2']
        )
        post_adapter_stats = (
            "analyses/fastq-stats/{0}.post-adapter.fastq.json".format(
                options.sample_tag
            )
        )
        fastq.stats(noadapter['fastq'], post_adapter_stats,
                    fastq2=noadapter['fastq2'], compressed=False)

        # Error-correct the adapter-filtered reads and record their stats.
        ecc = fastq.error_correct(
            noadapter['fastq'], NUM_CPU, OUT_DIR, fastq2=noadapter['fastq2']
        )
        post_ecc_stats = "analyses/fastq-stats/{0}.post-ecc.fastq.json".format(
            options.sample_tag
        )
        fastq.stats(ecc['fastq'], post_ecc_stats, fastq2=ecc['fastq2'],
                    compressed=False)

        # Final clean up is driven by the post-error-correction statistics.
        fastq.cleanup(ecc['fastq'], post_ecc_stats, options.paired,
                      options.no_length_filter, output_file,
                      fastq2=ecc['fastq2'],
                      coverage=options.coverage)
    finally:
        # Always drop the scratch directory, even if a step above failed.
        shared.run_command(['rm', '-rf', OUT_DIR])
@follows(cleanup)
# The dot is escaped and the pattern anchored so that only a literal trailing
# ".gz" extension is replaced by ".json".
@transform(cleanup, regex(r"(.*)\.gz$"), r"analyses/fastq-stats/\1.json")
@time_job(TIME_LOG)
def cleanup_stats(input_file, output_file):
    """Write sequence statistics for the cleaned up FASTQ.

    Args:
        input_file: The ``<sample_tag>.cleanup.fastq.gz`` file produced by
            ``cleanup``.
        output_file: JSON statistics path under ``analyses/fastq-stats/``,
            derived from ``input_file`` by replacing ``.gz`` with ``.json``.
    """
    fastq.stats(input_file, output_file)
# ----------------------------------------------------------------------------- | |
# Run every task defined above.
pipeline_run(
    verbose=5,
    # Abort on the first task exception.
    exceptions_terminate_immediately=True
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment