jasongallant/illuminator.py

## illuminator.py
import os
import sys
import glob
import shutil
import argparse
import subprocess
import multiprocessing
import itertools

class FullPaths(argparse.Action):
    """Expand user- and relative-paths"""
    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, os.path.abspath(os.path.expanduser(values)))


def get_args():
    parser = argparse.ArgumentParser(description='Pre-process Illumina reads')
    parser.add_argument('input', help='The input directory', action=FullPaths)
    parser.add_argument('output', help='The output directory', action=FullPaths)
    parser.add_argument('adaptors', help='The adaptor directory', action=FullPaths)
    parser.add_argument('--cores',
            type=int,
            default = 1,
            help='Number of cores to use.')

    return parser.parse_args()

def fastx_trimmer_runner(args):
    inpt,output=args
    basedir = os.path.split(inpt)[0]
    inbase = os.path.basename(inpt)
    infile = inpt

    outpth = os.path.join(output)
    outfile= os.path.join(outpth,inbase)
    outpth = open(os.path.join(outpth, inbase), 'wb')


    cmd = ['fastx_trimmer', '-Q', '33','-f','9','-i', infile, '-o', outfile]

    proc1 = subprocess.Popen(cmd)
    output = proc1.communicate()
    outpth.close()
    sys.stdout.write(".")
    sys.stdout.flush()

def chop_basepairs(pool,inputdirectory,outputdirectory):

    in_files = [filename for filename in glob.glob(inputdirectory+"/*.fastq")]
    os.mkdir(os.path.join(outputdirectory, 'fastx-trimmed'))
    out_files=os.path.join(outputdirectory, 'fastx-trimmed')


    sys.stdout.write("\nTrimming first 8 basepairs ")
    sys.stdout.flush()

    pairs = [item for item in itertools.izip_longest(in_files,[],fillvalue=out_files)]

    if pool != None:
        pool.map(fastx_trimmer_runner, pairs)
    else:
        [fastx_trimmer_runner(inpt) for inpt in pairs]
    return

def scythe_runner(args):

    inputfile,outputfile,adaptorfile=args

    basedir = os.path.split(inputfile)[0]
    adapters = os.path.join(adaptorfile, 'illumina_adapters.fa')
    inbase = os.path.basename(inputfile)
    infile = inputfile

    outpth = os.path.join(outputfile)
    outfile = os.path.join(outpth,inbase)
    outpth = open(os.path.join(outpth, inbase), 'wb')


    statpth = os.path.split(outputfile)[0]
    statpth = os.path.join(statpth, 'stats')

    statpth = open(os.path.join(statpth, inbase.strip(".fastq") + '.txt'), 'w')


    cmd = ['scythe', '-a', adapters, '-q', 'sanger', infile,'-o',outfile]

    proc1 = subprocess.Popen(cmd, stdout = statpth, stderr = statpth)
    output = proc1.communicate()
    outpth.close()
    statpth.close()
    sys.stdout.write(".")
    sys.stdout.flush()

def trim_adapter_sequences(pool,indirectory,outdirectory,adaptordirectory):

    in_files = [filename for filename in glob.glob(outdirectory+"/fastx-trimmed/*.fastq")]
    os.mkdir(os.path.join(outdirectory, 'adapter-trimmed'))
    out_files=os.path.join(outdirectory,'adapter-trimmed')
    os.mkdir(os.path.join(outdirectory, 'stats'))
    sys.stdout.write("\nTrimming adapter contamination")
    sys.stdout.flush()

    pairs = [item for item in itertools.izip(in_files,itertools.repeat(out_files),itertools.repeat(adaptordirectory))]


    if pool != None:
        pool.map(scythe_runner, pairs)
    else:
        [scythe_runner(pairs) for pairs in in_files]
    return

def zip_se_files(directory):
    directory = directory.rstrip("/")
    pairs = []

    test= enumerate(glob.glob(directory+"/*.fastq"))

    for count, fin in enumerate(glob.glob(directory+"/*.fastq")):
        if '_R1' in os.path.split(fin)[-1]:
            reads_1 = fin
            reads_2 = fin.replace('_R1', '_R2')
            pairs.append((reads_1, reads_2))

    return pairs

def sickle_se_runner(inpt):
    # infiles
    r1, r2 = inpt

    # outfiles
    out1 = [pth.replace("adapter-trimmed", "qual-trimmed") for pth in inpt]
    out1 = os.path.splitext(out1)[0] + ".adapter-trimmed.fastq"

    basedir = '/'.join(r1.split("/")[:-2])

    # create output file for stats
    statfile =  r1.replace("adapter-trimmed","stats")
    statfile = os.path.splitext(statfile)[0] + ".sickle-trim-stats.txt"
    statfile = open(statfile, 'w')

    # command for sickle (DROPPING ANY Ns)
    cmd = ["sickle", "se", "-f", r1, "-t", "sanger", "-o", out1, "-n"]
    print cmd
    proc1 = subprocess.Popen(cmd, stdout=statfile)
    proc1.communicate()
    statfile.close()
    sys.stdout.write(".")
    sys.stdout.flush()

def trim_low_qual_reads(pool, outputpath, pe =True):

    sys.stdout.write("\nTrimming low quality reads")
    sys.stdout.flush()

    # create input, output, and stats directories
    outdir = os.path.join(outputpath, "qual-trimmed")
    statdir = os.path.join(outputpath, "stats")
    inpth = os.path.join(outputpath,"adapter-trimmed")

    # check that directories don't exists
    if os.path.exists(outdir) == False:
        os.mkdir(outdir)

    if os.path.exists(statdir) == False:
        os.mkdir(statdir)

    if os.path.exists(inpth) == False:
        os.mkdir(inpth)

    if pe == True:
        in_files = zip_se_files(inpth)

    print in_files

    # map files to sickle runner

    if pool != None:
        pool.map(sickle_se_runner, in_files)
    else:
        [sickle_se_runner(inpt) for inpt in in_files]

    sys.stdout.write("\n\n")
    sys.stdout.flush()
    return

def main():
    args = get_args()

    # auto configure cores
    if args.cores > 1:
        cores = multiprocessing.cpu_count()
        pool = multiprocessing.Pool(cores)
    else:
        pool = None

    #trim adaptor contamination reads
    #chop_basepairs(pool,args.input,args.output)
    #trim_adapter_sequences(pool,args.input,args.output,args.adaptors)
    trim_low_qual_reads(pool,args.output)

if __name__ == '__main__':
    main()
	import os
	import sys
	import glob
	import shutil
	import argparse
	import subprocess
	import multiprocessing
	import itertools

	class FullPaths(argparse.Action):
	"""Expand user- and relative-paths"""
	def __call__(self, parser, namespace, values, option_string=None):
	setattr(namespace, self.dest, os.path.abspath(os.path.expanduser(values)))


	def get_args():
	parser = argparse.ArgumentParser(description='Pre-process Illumina reads')
	parser.add_argument('input', help='The input directory', action=FullPaths)
	parser.add_argument('output', help='The output directory', action=FullPaths)
	parser.add_argument('adaptors', help='The adaptor directory', action=FullPaths)
	parser.add_argument('--cores',
	type=int,
	default = 1,
	help='Number of cores to use.')

	return parser.parse_args()

	def fastx_trimmer_runner(args):
	inpt,output=args
	basedir = os.path.split(inpt)[0]
	inbase = os.path.basename(inpt)
	infile = inpt

	outpth = os.path.join(output)
	outfile= os.path.join(outpth,inbase)
	outpth = open(os.path.join(outpth, inbase), 'wb')


	cmd = ['fastx_trimmer', '-Q', '33','-f','9','-i', infile, '-o', outfile]

	proc1 = subprocess.Popen(cmd)
	output = proc1.communicate()
	outpth.close()
	sys.stdout.write(".")
	sys.stdout.flush()

	def chop_basepairs(pool,inputdirectory,outputdirectory):

	in_files = [filename for filename in glob.glob(inputdirectory+"/*.fastq")]
	os.mkdir(os.path.join(outputdirectory, 'fastx-trimmed'))
	out_files=os.path.join(outputdirectory, 'fastx-trimmed')


	sys.stdout.write("\nTrimming first 8 basepairs ")
	sys.stdout.flush()

	pairs = [item for item in itertools.izip_longest(in_files,[],fillvalue=out_files)]

	if pool != None:
	pool.map(fastx_trimmer_runner, pairs)
	else:
	[fastx_trimmer_runner(inpt) for inpt in pairs]
	return

	def scythe_runner(args):

	inputfile,outputfile,adaptorfile=args

	basedir = os.path.split(inputfile)[0]
	adapters = os.path.join(adaptorfile, 'illumina_adapters.fa')
	inbase = os.path.basename(inputfile)
	infile = inputfile

	outpth = os.path.join(outputfile)
	outfile = os.path.join(outpth,inbase)
	outpth = open(os.path.join(outpth, inbase), 'wb')


	statpth = os.path.split(outputfile)[0]
	statpth = os.path.join(statpth, 'stats')

	statpth = open(os.path.join(statpth, inbase.strip(".fastq") + '.txt'), 'w')


	cmd = ['scythe', '-a', adapters, '-q', 'sanger', infile,'-o',outfile]

	proc1 = subprocess.Popen(cmd, stdout = statpth, stderr = statpth)
	output = proc1.communicate()
	outpth.close()
	statpth.close()
	sys.stdout.write(".")
	sys.stdout.flush()

	def trim_adapter_sequences(pool,indirectory,outdirectory,adaptordirectory):

	in_files = [filename for filename in glob.glob(outdirectory+"/fastx-trimmed/*.fastq")]
	os.mkdir(os.path.join(outdirectory, 'adapter-trimmed'))
	out_files=os.path.join(outdirectory,'adapter-trimmed')
	os.mkdir(os.path.join(outdirectory, 'stats'))
	sys.stdout.write("\nTrimming adapter contamination")
	sys.stdout.flush()

	pairs = [item for item in itertools.izip(in_files,itertools.repeat(out_files),itertools.repeat(adaptordirectory))]


	if pool != None:
	pool.map(scythe_runner, pairs)
	else:
	[scythe_runner(pairs) for pairs in in_files]
	return

	def zip_se_files(directory):
	directory = directory.rstrip("/")
	pairs = []

	test= enumerate(glob.glob(directory+"/*.fastq"))

	for count, fin in enumerate(glob.glob(directory+"/*.fastq")):
	if '_R1' in os.path.split(fin)[-1]:
	reads_1 = fin
	reads_2 = fin.replace('_R1', '_R2')
	pairs.append((reads_1, reads_2))

	return pairs

	def sickle_se_runner(inpt):
	# infiles
	r1, r2 = inpt

	# outfiles
	out1 = [pth.replace("adapter-trimmed", "qual-trimmed") for pth in inpt]
	out1 = os.path.splitext(out1)[0] + ".adapter-trimmed.fastq"

	basedir = '/'.join(r1.split("/")[:-2])

	# create output file for stats
	statfile = r1.replace("adapter-trimmed","stats")
	statfile = os.path.splitext(statfile)[0] + ".sickle-trim-stats.txt"
	statfile = open(statfile, 'w')

	# command for sickle (DROPPING ANY Ns)
	cmd = ["sickle", "se", "-f", r1, "-t", "sanger", "-o", out1, "-n"]
	print cmd
	proc1 = subprocess.Popen(cmd, stdout=statfile)
	proc1.communicate()
	statfile.close()
	sys.stdout.write(".")
	sys.stdout.flush()

	def trim_low_qual_reads(pool, outputpath, pe =True):

	sys.stdout.write("\nTrimming low quality reads")
	sys.stdout.flush()

	# create input, output, and stats directories
	outdir = os.path.join(outputpath, "qual-trimmed")
	statdir = os.path.join(outputpath, "stats")
	inpth = os.path.join(outputpath,"adapter-trimmed")

	# check that directories don't exists
	if os.path.exists(outdir) == False:
	os.mkdir(outdir)

	if os.path.exists(statdir) == False:
	os.mkdir(statdir)

	if os.path.exists(inpth) == False:
	os.mkdir(inpth)

	if pe == True:
	in_files = zip_se_files(inpth)

	print in_files

	# map files to sickle runner

	if pool != None:
	pool.map(sickle_se_runner, in_files)
	else:
	[sickle_se_runner(inpt) for inpt in in_files]

	sys.stdout.write("\n\n")
	sys.stdout.flush()
	return

	def main():
	args = get_args()

	# auto configure cores
	if args.cores > 1:
	cores = multiprocessing.cpu_count()
	pool = multiprocessing.Pool(cores)
	else:
	pool = None

	#trim adaptor contamination reads
	#chop_basepairs(pool,args.input,args.output)
	#trim_adapter_sequences(pool,args.input,args.output,args.adaptors)
	trim_low_qual_reads(pool,args.output)

	if __name__ == '__main__':
	main()