@rpetit3 · Created October 18, 2017
Ruffus version of FASTQ cleanup

#! /usr/bin/env python
"""Clean up an input FASTQ file."""
import sys
from ruffus import *
from staphopia.helpers.time_job import time_job
from staphopia.tasks import fastq, shared
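
# Command line options -------------------------------------------------------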
parser = cmdline.get_argparse(description='Cleanup FASTQ files')
parser.add_argument("fastq", help="Compressed FASTQ file (*.tar.gz)")
parser.add_argument("-f2", "--fastq2", dest="fastq2",
                    help="Compressed FASTQ file (*.tar.gz)")
parser.add_argument('-p', '--processors', metavar="INT", type=int, default=1,
                    help='Number of processors to use. (Default 1)')
parser.add_argument('--coverage', metavar="INT", type=int, default=100,
                    help='Coverage to subsample to.')
parser.add_argument('--paired', action='store_true', default=False,
                    help='Input is interleaved paired end reads.')
parser.add_argument('--no_length_filter', action='store_true', default=False,
                    help='Do not filter reads based on read lengths.')
parser.add_argument('--sample_tag', dest='sample_tag', default='sample_tag',
                    help='Optional: Sample tag of input. (Default sample_tag)')
parser.add_argument('--log_times', action='store_true', default=False,
                    help='Write task run times to file (Default: STDERR).')
options = parser.parse_args()
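
# Globals --------------------------------------------------------------------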
NUM_CPU = str(options.processors)
OUT_DIR = 'analyses/fastq-qc/'
BBDUK_PHIX = 'logs/fastq_cleanup-bbduk-phix.txt'
BBDUK_ADAPTER = 'logs/fastq_cleanup-bbduk-adapter.txt'
BBDUK_ECC = 'logs/fastq_cleanup-bbduk-ecc.txt'
TIME_LOG = sys.stderr
if options.log_times:
    TIME_LOG = 'logs/time/fastq_cleanup.txt'

# Pipeline --------------------------------------------------------------------
@mkdir('logs/time')
@mkdir('analyses/fastq-stats')
@transform(
    options.fastq, regex(r"(.*)"),
    "analyses/fastq-stats/{0}.original.fastq.json".format(options.sample_tag)
)
@time_job(TIME_LOG, new_stream=True)
def raw_stats(input_file, output_file):
    """Calculate sequence stats of the input FASTQ."""
    fastq.stats(input_file, output_file, fastq2=options.fastq2)


@follows(raw_stats)
@transform(raw_stats, regex(r"(.*)"),
           '{0}.cleanup.fastq.gz'.format(options.sample_tag))
@time_job(TIME_LOG)
def cleanup(input_file, output_file):
    """Clean up FASTQ based on statistics."""
    shared.run_command(['rm', '-rf', OUT_DIR])
    shared.run_command(['mkdir', '-p', OUT_DIR])
    # Filter out phiX and adapter sequences
    nophix = fastq.filter_phix(options.fastq, NUM_CPU, OUT_DIR, BBDUK_PHIX,
                               fastq2=options.fastq2)
    noadapter = fastq.filter_adapters(
        nophix['fastq'], NUM_CPU, OUT_DIR, BBDUK_ADAPTER,
        fastq2=nophix['fastq2']
    )
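    # Record sequence stats after adapter removal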
stats_file = "analyses/fastq-stats/{0}.post-adapter.fastq.json".format(
options.sample_tag
)
fastq.stats(noadapter['fastq'], stats_file, fastq2=noadapter['fastq2'],
compressed=False)
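    # Error correct the reads, then record stats again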
    ecc = fastq.error_correct(
        noadapter['fastq'], NUM_CPU, OUT_DIR, fastq2=noadapter['fastq2']
    )
    stats_file = "analyses/fastq-stats/{0}.post-ecc.fastq.json".format(
        options.sample_tag
    )
    fastq.stats(ecc['fastq'], stats_file, fastq2=ecc['fastq2'],
                compressed=False)
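    # Subsample to the target coverage and write the final cleaned FASTQ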
    fastq.cleanup(ecc['fastq'], stats_file, options.paired,
                  options.no_length_filter, output_file,
                  fastq2=ecc['fastq2'],
                  coverage=options.coverage)
    shared.run_command(['rm', '-rf', OUT_DIR])


@follows(cleanup)
@transform(cleanup, regex(r"(.*)\.gz"), r"analyses/fastq-stats/\1.json")
@time_job(TIME_LOG)
def cleanup_stats(input_file, output_file):
    """Calculate sequence stats of the cleaned up FASTQ."""
    fastq.stats(input_file, output_file)

# -----------------------------------------------------------------------------
pipeline_run(exceptions_terminate_immediately=True, verbose=5)
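
A minimal usage sketch, assuming the script is saved as fastq_cleanup.py and the staphopia helpers are importable; the input file names below are placeholders, not from the gist:

    # Single-end reads, 4 processors, task run times written to logs/time/
    python fastq_cleanup.py --processors 4 --log_times sample_01.tar.gz

    # Interleaved paired-end reads with an explicit sample tag
    python fastq_cleanup.py --paired --sample_tag sample_01 sample_01.tar.gz

Note that cmdline.get_argparse() also adds the standard Ruffus flags (e.g. --verbose, --just_print), but since the script calls pipeline_run() directly rather than cmdline.run(options), those flags are parsed without being applied.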