Skip to content

Instantly share code, notes, and snippets.

@jasongallant
Created September 13, 2012 18:31
Show Gist options
  • Save jasongallant/3716500 to your computer and use it in GitHub Desktop.
Save jasongallant/3716500 to your computer and use it in GitHub Desktop.
Process Illumina Files (RNASeq)
import os
import sys
import glob
import shutil
import argparse
import subprocess
import multiprocessing
import itertools
class FullPaths(argparse.Action):
"""Expand user- and relative-paths"""
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, os.path.abspath(os.path.expanduser(values)))
def get_args():
parser = argparse.ArgumentParser(description='Pre-process Illumina reads')
parser.add_argument('input', help='The input directory', action=FullPaths)
parser.add_argument('output', help='The output directory', action=FullPaths)
parser.add_argument('adaptors', help='The adaptor directory', action=FullPaths)
parser.add_argument('--cores',
type=int,
default = 1,
help='Number of cores to use.')
return parser.parse_args()
def fastx_trimmer_runner(args):
inpt,output=args
basedir = os.path.split(inpt)[0]
inbase = os.path.basename(inpt)
infile = inpt
outpth = os.path.join(output)
outfile= os.path.join(outpth,inbase)
outpth = open(os.path.join(outpth, inbase), 'wb')
cmd = ['fastx_trimmer', '-Q', '33','-f','9','-i', infile, '-o', outfile]
proc1 = subprocess.Popen(cmd)
output = proc1.communicate()
outpth.close()
sys.stdout.write(".")
sys.stdout.flush()
def chop_basepairs(pool,inputdirectory,outputdirectory):
in_files = [filename for filename in glob.glob(inputdirectory+"/*.fastq")]
os.mkdir(os.path.join(outputdirectory, 'fastx-trimmed'))
out_files=os.path.join(outputdirectory, 'fastx-trimmed')
sys.stdout.write("\nTrimming first 8 basepairs ")
sys.stdout.flush()
pairs = [item for item in itertools.izip_longest(in_files,[],fillvalue=out_files)]
if pool != None:
pool.map(fastx_trimmer_runner, pairs)
else:
[fastx_trimmer_runner(inpt) for inpt in pairs]
return
def scythe_runner(args):
inputfile,outputfile,adaptorfile=args
basedir = os.path.split(inputfile)[0]
adapters = os.path.join(adaptorfile, 'illumina_adapters.fa')
inbase = os.path.basename(inputfile)
infile = inputfile
outpth = os.path.join(outputfile)
outfile = os.path.join(outpth,inbase)
outpth = open(os.path.join(outpth, inbase), 'wb')
statpth = os.path.split(outputfile)[0]
statpth = os.path.join(statpth, 'stats')
statpth = open(os.path.join(statpth, inbase.strip(".fastq") + '.txt'), 'w')
cmd = ['scythe', '-a', adapters, '-q', 'sanger', infile,'-o',outfile]
proc1 = subprocess.Popen(cmd, stdout = statpth, stderr = statpth)
output = proc1.communicate()
outpth.close()
statpth.close()
sys.stdout.write(".")
sys.stdout.flush()
def trim_adapter_sequences(pool,indirectory,outdirectory,adaptordirectory):
in_files = [filename for filename in glob.glob(outdirectory+"/fastx-trimmed/*.fastq")]
os.mkdir(os.path.join(outdirectory, 'adapter-trimmed'))
out_files=os.path.join(outdirectory,'adapter-trimmed')
os.mkdir(os.path.join(outdirectory, 'stats'))
sys.stdout.write("\nTrimming adapter contamination")
sys.stdout.flush()
pairs = [item for item in itertools.izip(in_files,itertools.repeat(out_files),itertools.repeat(adaptordirectory))]
if pool != None:
pool.map(scythe_runner, pairs)
else:
[scythe_runner(pairs) for pairs in in_files]
return
def zip_se_files(directory):
directory = directory.rstrip("/")
pairs = []
test= enumerate(glob.glob(directory+"/*.fastq"))
for count, fin in enumerate(glob.glob(directory+"/*.fastq")):
if '_R1' in os.path.split(fin)[-1]:
reads_1 = fin
reads_2 = fin.replace('_R1', '_R2')
pairs.append((reads_1, reads_2))
return pairs
def sickle_se_runner(inpt):
# infiles
r1, r2 = inpt
# outfiles
out1 = [pth.replace("adapter-trimmed", "qual-trimmed") for pth in inpt]
out1 = os.path.splitext(out1)[0] + ".adapter-trimmed.fastq"
basedir = '/'.join(r1.split("/")[:-2])
# create output file for stats
statfile = r1.replace("adapter-trimmed","stats")
statfile = os.path.splitext(statfile)[0] + ".sickle-trim-stats.txt"
statfile = open(statfile, 'w')
# command for sickle (DROPPING ANY Ns)
cmd = ["sickle", "se", "-f", r1, "-t", "sanger", "-o", out1, "-n"]
print cmd
proc1 = subprocess.Popen(cmd, stdout=statfile)
proc1.communicate()
statfile.close()
sys.stdout.write(".")
sys.stdout.flush()
def trim_low_qual_reads(pool, outputpath, pe =True):
sys.stdout.write("\nTrimming low quality reads")
sys.stdout.flush()
# create input, output, and stats directories
outdir = os.path.join(outputpath, "qual-trimmed")
statdir = os.path.join(outputpath, "stats")
inpth = os.path.join(outputpath,"adapter-trimmed")
# check that directories don't exists
if os.path.exists(outdir) == False:
os.mkdir(outdir)
if os.path.exists(statdir) == False:
os.mkdir(statdir)
if os.path.exists(inpth) == False:
os.mkdir(inpth)
if pe == True:
in_files = zip_se_files(inpth)
print in_files
# map files to sickle runner
if pool != None:
pool.map(sickle_se_runner, in_files)
else:
[sickle_se_runner(inpt) for inpt in in_files]
sys.stdout.write("\n\n")
sys.stdout.flush()
return
def main():
args = get_args()
# auto configure cores
if args.cores > 1:
cores = multiprocessing.cpu_count()
pool = multiprocessing.Pool(cores)
else:
pool = None
#trim adaptor contamination reads
#chop_basepairs(pool,args.input,args.output)
#trim_adapter_sequences(pool,args.input,args.output,args.adaptors)
trim_low_qual_reads(pool,args.output)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment