Created
September 13, 2012 18:31
-
-
Save jasongallant/3716500 to your computer and use it in GitHub Desktop.
Process Illumina Files (RNASeq)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import glob | |
import shutil | |
import argparse | |
import subprocess | |
import multiprocessing | |
import itertools | |
class FullPaths(argparse.Action): | |
"""Expand user- and relative-paths""" | |
def __call__(self, parser, namespace, values, option_string=None): | |
setattr(namespace, self.dest, os.path.abspath(os.path.expanduser(values))) | |
def get_args(): | |
parser = argparse.ArgumentParser(description='Pre-process Illumina reads') | |
parser.add_argument('input', help='The input directory', action=FullPaths) | |
parser.add_argument('output', help='The output directory', action=FullPaths) | |
parser.add_argument('adaptors', help='The adaptor directory', action=FullPaths) | |
parser.add_argument('--cores', | |
type=int, | |
default = 1, | |
help='Number of cores to use.') | |
return parser.parse_args() | |
def fastx_trimmer_runner(args): | |
inpt,output=args | |
basedir = os.path.split(inpt)[0] | |
inbase = os.path.basename(inpt) | |
infile = inpt | |
outpth = os.path.join(output) | |
outfile= os.path.join(outpth,inbase) | |
outpth = open(os.path.join(outpth, inbase), 'wb') | |
cmd = ['fastx_trimmer', '-Q', '33','-f','9','-i', infile, '-o', outfile] | |
proc1 = subprocess.Popen(cmd) | |
output = proc1.communicate() | |
outpth.close() | |
sys.stdout.write(".") | |
sys.stdout.flush() | |
def chop_basepairs(pool,inputdirectory,outputdirectory): | |
in_files = [filename for filename in glob.glob(inputdirectory+"/*.fastq")] | |
os.mkdir(os.path.join(outputdirectory, 'fastx-trimmed')) | |
out_files=os.path.join(outputdirectory, 'fastx-trimmed') | |
sys.stdout.write("\nTrimming first 8 basepairs ") | |
sys.stdout.flush() | |
pairs = [item for item in itertools.izip_longest(in_files,[],fillvalue=out_files)] | |
if pool != None: | |
pool.map(fastx_trimmer_runner, pairs) | |
else: | |
[fastx_trimmer_runner(inpt) for inpt in pairs] | |
return | |
def scythe_runner(args): | |
inputfile,outputfile,adaptorfile=args | |
basedir = os.path.split(inputfile)[0] | |
adapters = os.path.join(adaptorfile, 'illumina_adapters.fa') | |
inbase = os.path.basename(inputfile) | |
infile = inputfile | |
outpth = os.path.join(outputfile) | |
outfile = os.path.join(outpth,inbase) | |
outpth = open(os.path.join(outpth, inbase), 'wb') | |
statpth = os.path.split(outputfile)[0] | |
statpth = os.path.join(statpth, 'stats') | |
statpth = open(os.path.join(statpth, inbase.strip(".fastq") + '.txt'), 'w') | |
cmd = ['scythe', '-a', adapters, '-q', 'sanger', infile,'-o',outfile] | |
proc1 = subprocess.Popen(cmd, stdout = statpth, stderr = statpth) | |
output = proc1.communicate() | |
outpth.close() | |
statpth.close() | |
sys.stdout.write(".") | |
sys.stdout.flush() | |
def trim_adapter_sequences(pool,indirectory,outdirectory,adaptordirectory): | |
in_files = [filename for filename in glob.glob(outdirectory+"/fastx-trimmed/*.fastq")] | |
os.mkdir(os.path.join(outdirectory, 'adapter-trimmed')) | |
out_files=os.path.join(outdirectory,'adapter-trimmed') | |
os.mkdir(os.path.join(outdirectory, 'stats')) | |
sys.stdout.write("\nTrimming adapter contamination") | |
sys.stdout.flush() | |
pairs = [item for item in itertools.izip(in_files,itertools.repeat(out_files),itertools.repeat(adaptordirectory))] | |
if pool != None: | |
pool.map(scythe_runner, pairs) | |
else: | |
[scythe_runner(pairs) for pairs in in_files] | |
return | |
def zip_se_files(directory): | |
directory = directory.rstrip("/") | |
pairs = [] | |
test= enumerate(glob.glob(directory+"/*.fastq")) | |
for count, fin in enumerate(glob.glob(directory+"/*.fastq")): | |
if '_R1' in os.path.split(fin)[-1]: | |
reads_1 = fin | |
reads_2 = fin.replace('_R1', '_R2') | |
pairs.append((reads_1, reads_2)) | |
return pairs | |
def sickle_se_runner(inpt): | |
# infiles | |
r1, r2 = inpt | |
# outfiles | |
out1 = [pth.replace("adapter-trimmed", "qual-trimmed") for pth in inpt] | |
out1 = os.path.splitext(out1)[0] + ".adapter-trimmed.fastq" | |
basedir = '/'.join(r1.split("/")[:-2]) | |
# create output file for stats | |
statfile = r1.replace("adapter-trimmed","stats") | |
statfile = os.path.splitext(statfile)[0] + ".sickle-trim-stats.txt" | |
statfile = open(statfile, 'w') | |
# command for sickle (DROPPING ANY Ns) | |
cmd = ["sickle", "se", "-f", r1, "-t", "sanger", "-o", out1, "-n"] | |
print cmd | |
proc1 = subprocess.Popen(cmd, stdout=statfile) | |
proc1.communicate() | |
statfile.close() | |
sys.stdout.write(".") | |
sys.stdout.flush() | |
def trim_low_qual_reads(pool, outputpath, pe =True): | |
sys.stdout.write("\nTrimming low quality reads") | |
sys.stdout.flush() | |
# create input, output, and stats directories | |
outdir = os.path.join(outputpath, "qual-trimmed") | |
statdir = os.path.join(outputpath, "stats") | |
inpth = os.path.join(outputpath,"adapter-trimmed") | |
# check that directories don't exists | |
if os.path.exists(outdir) == False: | |
os.mkdir(outdir) | |
if os.path.exists(statdir) == False: | |
os.mkdir(statdir) | |
if os.path.exists(inpth) == False: | |
os.mkdir(inpth) | |
if pe == True: | |
in_files = zip_se_files(inpth) | |
print in_files | |
# map files to sickle runner | |
if pool != None: | |
pool.map(sickle_se_runner, in_files) | |
else: | |
[sickle_se_runner(inpt) for inpt in in_files] | |
sys.stdout.write("\n\n") | |
sys.stdout.flush() | |
return | |
def main(): | |
args = get_args() | |
# auto configure cores | |
if args.cores > 1: | |
cores = multiprocessing.cpu_count() | |
pool = multiprocessing.Pool(cores) | |
else: | |
pool = None | |
#trim adaptor contamination reads | |
#chop_basepairs(pool,args.input,args.output) | |
#trim_adapter_sequences(pool,args.input,args.output,args.adaptors) | |
trim_low_qual_reads(pool,args.output) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment