Created December 1, 2017 17:48
-
-
Save jasonsahl/5c24196cdceba9155555619c3b76a699 to your computer and use it in GitHub Desktop.
Kallisto read count wrapper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Read counts across a set of reference sequences. | |
Requires Python 2.7 to run""" | |
from __future__ import division | |
from __future__ import print_function | |
from optparse import OptionParser | |
import sys | |
import os | |
import subprocess | |
import glob | |
import re | |
import logging | |
from subprocess import Popen | |
def test_file(option, opt_str, value, parser): | |
try: | |
with open(value): setattr(parser.values, option.dest, value) | |
except IOError: | |
print('%s file cannot be opened' % option) | |
sys.exit() | |
def test_dir(option, opt_str, value, parser): | |
if os.path.exists(value): | |
setattr(parser.values, option.dest, value) | |
else: | |
print("directory of fastqs cannot be found") | |
sys.exit() | |
def get_readFile_components(full_file_path):
    """Split a read-file path into (directory, base name, full extension).

    Handles a trailing ".gz" specially so that "sample.fastq.gz" yields
    the extension ".fastq.gz" rather than just ".gz".

    Function adapted from: https://github.com/katholt/srst2

    Returns the tuple (file_path, file_name_before_ext, full_ext).
    """
    (file_path, file_name) = os.path.split(full_file_path)
    ext = ""
    m1 = re.match(r"(.*).gz", file_name)
    # Idiom fix: compare to None with "is not", not "!=".
    if m1 is not None:
        # Peel off the ".gz" first so splitext sees the inner extension.
        ext = ".gz"
        file_name = m1.groups()[0]
    (file_name_before_ext, ext2) = os.path.splitext(file_name)
    full_ext = ext2 + ext
    return (file_path, file_name_before_ext, full_ext)
def read_file_sets(dir_path):
    """Match up pairs of sequence data, adapted from
    https://github.com/katholt/srst2

    Scans *dir_path* for ``*.fastq.gz`` files and groups them by sample
    name parsed from Illumina-style file names.

    Returns a dict mapping sample name -> [forward, reverse] for paired
    reads, or a one-element list of the single file for unpaired reads.
    """
    fileSets = {}
    forward_reads = {}
    reverse_reads = {}
    num_paired_readsets = 0
    num_single_readsets = 0
    for infile in glob.glob(os.path.join(dir_path, "*.fastq.gz")):
        (file_path, file_name_before_ext, full_ext) = get_readFile_components(infile)
        # Illumina-style name: <sample>_S#_L###_R#_### (group 4 is the _R part).
        m = re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", file_name_before_ext)
        if m is None:
            # Fall back to simpler <sample>_R1_* / <sample>_R2_* names.
            m = re.match("(.*)("+"_R1"+")(_.*)$", file_name_before_ext)
            if m is not None:
                baseName = m.groups()[0]
                forward_reads[baseName] = infile
            else:
                m = re.match("(.*)("+"_R2"+")(_.*)$", file_name_before_ext)
                if m is not None:
                    baseName = m.groups()[0]
                    reverse_reads[baseName] = infile
                else:
                    print("Could not determine forward/reverse read status for input file %s" % infile)
        else:
            baseName, read = m.groups()[0], m.groups()[3]
            if read == "_R1":
                forward_reads[baseName] = infile
            elif read == "_R2":
                reverse_reads[baseName] = infile
            else:
                print("Could not determine forward/reverse read status for input file")
                # Bug fix: store as a one-element list (was a bare string) so
                # downstream code that does list(files) gets paths, not chars.
                fileSets[file_name_before_ext] = [infile]
                num_single_readsets += 1
    for sample in forward_reads:
        if sample in reverse_reads:
            fileSets[sample] = [forward_reads[sample], reverse_reads[sample]]  # store pair
            num_paired_readsets += 1
        else:
            fileSets[sample] = [forward_reads[sample]]  # no reverse found
            num_single_readsets += 1
            logging.info('Warning, could not find pair for read:' + forward_reads[sample])
    for sample in reverse_reads:
        if sample not in fileSets:
            # Bug fix: wrap in a list for consistency with every other entry.
            fileSets[sample] = [reverse_reads[sample]]  # no forward found
            num_single_readsets += 1
            logging.info('Warning, could not find pair for read:' + reverse_reads[sample])
    if num_paired_readsets > 0:
        logging.info('Total paired readsets found:' + str(num_paired_readsets))
    if num_single_readsets > 0:
        logging.info('Total single reads found:' + str(num_single_readsets))
    return fileSets
def mp_shell(func, params, numProc):
    """Map *func* over *params* using a pool of *numProc* worker processes.

    Returns the list of results in input order.
    """
    from multiprocessing import Pool
    worker_pool = Pool(numProc)
    results = worker_pool.map(func, params)
    worker_pool.terminate()
    return results
def run_kallisto_loop_dev(fileSets, dir_path, reference, processors):
    """Run kallisto quant over every read set in parallel.

    fileSets: dict mapping sample name -> list of FASTQ paths.
    dir_path and reference are unused here (the workflow hard-codes the
    "REF" index name in the worker); kept for interface compatibility.

    Returns the list of sample names that were processed.
    """
    # idx is the sample name, f is the list of read files for that sample.
    # Bug fix: .items() instead of the Python-2-only .iteritems(), and the
    # stray non-docstring string literal is now a real docstring.
    files_and_temp_names = [(str(idx), list(f)) for idx, f in fileSets.items()]
    mp_shell(_perform_workflow_kallisto, files_and_temp_names, processors)
    return [pair[0] for pair in files_and_temp_names]
def _perform_workflow_kallisto(data):
    """Worker: run kallisto quant for one (sample_name, [read files]) tuple.

    Single-end samples get fixed fragment-length parameters (-l 100 -s 30);
    paired samples pass both mates. Output goes to a directory named after
    the sample; kallisto's own output is discarded.
    """
    sample_name = data[0]
    sequences = data[1]
    if len(sequences) == 1:
        cmd = "kallisto quant --single --bias -l 100 -s 30 -o %s -i %s %s > /dev/null 2>&1" % (sample_name, "REF", "".join(sequences))
    else:
        cmd = "kallisto quant --bias -o %s -i %s %s %s > /dev/null 2>&1" % (sample_name, "REF", sequences[0], sequences[1])
    subprocess.check_call(cmd, shell=True)
def process_kallisto_dev(names):
    """Write one <name>.tmp file per sample: the sample name on the first
    line, then the rounded est_counts column from <name>/abundance.tsv.

    These .tmp files are later pasted side by side into the merged table.
    """
    for name in names:
        # Bug fix: use with-blocks so both files are closed even on error
        # (the input file was previously never closed at all).
        with open("%s.tmp" % name, "w") as outfile:
            outfile.write(str(name) + "\n")
            with open("%s/abundance.tsv" % name) as abundance:
                for line in abundance:
                    # Skip kallisto's header row.
                    if line.startswith("target_id"):
                        continue
                    fields = line.strip().split()
                    # est_counts is column 4; round to the nearest integer.
                    value = round(float(fields[3]))
                    outfile.write(str(int(value)) + "\n")
def get_ref_fields(genome):
    """Write ref.list: a "target" header plus the target_id column from
    <genome>/abundance.tsv, providing row labels for the merged table.

    Any sample's abundance.tsv works, since all samples were quantified
    against the same index.
    """
    # Bug fix: with-blocks guarantee both files are closed (the input
    # file handle was previously never closed).
    with open("ref.list", "w") as outfile:
        outfile.write("target" + "\n")
        with open("%s/abundance.tsv" % genome) as abundance:
            for line in abundance:
                # Skip kallisto's header row.
                if line.startswith("target_id"):
                    continue
                fields = line.split()
                outfile.write(fields[0] + "\n")
def main(read_dir, reference, processors):
    """Drive the whole workflow: verify kallisto is installed, build the
    index, quantify every read set, and merge the per-sample counts into
    kallisto_merged_counts.txt."""
    start_dir = os.getcwd()
    start_path = os.path.abspath(start_dir)
    ref_path = os.path.abspath("%s" % reference)
    dir_path = os.path.abspath("%s" % read_dir)
    # Bail out early if kallisto is not on the PATH.
    ac = subprocess.call(['which', 'kallisto'])
    if ac != 0:
        print("kallisto must be in your path")
        sys.exit()
    fileSets = read_file_sets(dir_path)
    # Build the kallisto index once, named "REF", used by every worker.
    subprocess.check_call("kallisto index %s -i REF > /dev/null 2>&1" % ref_path, shell=True)
    logging.info("Running kallisto")
    names = run_kallisto_loop_dev(fileSets, dir_path, "REF", processors)
    logging.info("kallisto finished")
    process_kallisto_dev(names)
    # Any sample directory yields the same target column; use the first.
    get_ref_fields(names[0])
    os.system("paste ref.list *.tmp > kallisto_merged_counts.txt")
    os.system("rm ref.list *.tmp")
if __name__ == "__main__":
    # Command-line entry point: parse options, enforce required ones,
    # then hand off to main().
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--read_dir", dest="read_dir",
                      help="directory of FASTQ reads [REQUIRED]",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-r", "--reference", dest="reference",
                      help="reference genome in FASTA format [REQUIRED]",
                      action="callback", callback=test_file, type="string")
    # Bug fix: default must be the int 4, not the string "4" — optparse
    # does not type-convert defaults, and a string processor count breaks
    # multiprocessing.Pool when -p is omitted.
    parser.add_option("-p", "--processors", dest="processors",
                      help="number of processors to use, defaults to 4",
                      action="store", default=4, type="int")
    options, args = parser.parse_args()
    mandatories = ["read_dir", "reference"]
    for m in mandatories:
        if not options.__dict__[m]:
            print("\nMust provide %s.\n" % m)
            parser.print_help()
            exit(-1)
    main(options.read_dir, options.reference, options.processors)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment