jasonsahl/kallisto_wrapper.py

## kallisto_wrapper.py
#!/usr/bin/env python

"""Read counts across a set of reference sequences.
Requires Python 2.7 to run"""

from __future__ import division
from __future__ import print_function
from optparse import OptionParser
import sys
import os
import subprocess
import glob
import re
import logging
from subprocess import Popen

def test_file(option, opt_str, value, parser):
    try:
        with open(value): setattr(parser.values, option.dest, value)
    except IOError:
        print('%s file cannot be opened' % option)
        sys.exit()

def test_dir(option, opt_str, value, parser):
    if os.path.exists(value):
        setattr(parser.values, option.dest, value)
    else:
        print("directory of fastqs cannot be found")
        sys.exit()

def get_readFile_components(full_file_path):
    """function adapted from:
    https://github.com/katholt/srst2"""
    (file_path,file_name) = os.path.split(full_file_path)
    m1 = re.match("(.*).gz",file_name)
    ext = ""
    if m1 != None:
        ext = ".gz"
        file_name = m1.groups()[0]
    (file_name_before_ext,ext2) = os.path.splitext(file_name)
    full_ext = ext2+ext
    return(file_path,file_name_before_ext,full_ext)

def read_file_sets(dir_path):
    """match up pairs of sequence data, adapted from
    https://github.com/katholt/srst2"""
    fileSets = {}
    forward_reads = {}
    reverse_reads = {}
    num_paired_readsets = 0
    num_single_readsets = 0
    for infile in glob.glob(os.path.join(dir_path, "*.fastq.gz")):
        (file_path,file_name_before_ext,full_ext) = get_readFile_components(infile)
        m=re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", file_name_before_ext)
        if m is None:
            m=re.match("(.*)("+"_R1"+")(_.*)$",file_name_before_ext)
            if m is not None:
                (baseName,read) = m.groups()[0], m.groups()[1]
                forward_reads[baseName] = infile
            else:
                m=re.match("(.*)("+"_R2"+")(_.*)$",file_name_before_ext)
                if m is not None:
                    (baseName,read) = m.groups()[0], m.groups()[1]
                    reverse_reads[baseName] = infile
                else:
                    print("Could not determine forward/reverse read status for input file %s" % infile)
        else:
            baseName, read  = m.groups()[0], m.groups()[3]
            if read == "_R1":
                forward_reads[baseName] = infile
            elif read == "_R2":
                reverse_reads[baseName] = infile
            else:
                print("Could not determine forward/reverse read status for input file")
                fileSets[file_name_before_ext] = infile
                num_single_readsets += 1
    for sample in forward_reads:
        if sample in reverse_reads:
            fileSets[sample] = [forward_reads[sample],reverse_reads[sample]] # store pair
            num_paired_readsets += 1
        else:
            fileSets[sample] = [forward_reads[sample]] # no reverse found
            num_single_readsets += 1
            logging.info('Warning, could not find pair for read:' + forward_reads[sample])
    for sample in reverse_reads:
        if sample not in fileSets:
            fileSets[sample] = reverse_reads[sample] # no forward found
            num_single_readsets += 1
            logging.info('Warning, could not find pair for read:' + reverse_reads[sample])
    if num_paired_readsets > 0:
        logging.info('Total paired readsets found:' + str(num_paired_readsets))
    if num_single_readsets > 0:
        logging.info('Total single reads found:' + str(num_single_readsets))
    return fileSets

def mp_shell(func, params, numProc):
    from multiprocessing import Pool
    p = Pool(numProc)
    out = p.map(func, params)
    p.terminate()
    return out

def run_kallisto_loop_dev(fileSets,dir_path,reference,processors):
    names = []
    """"idx is the sample name, f is the file dictionary"""
    files_and_temp_names = [(str(idx), list(f)) for idx, f in fileSets.iteritems()]
    mp_shell(_perform_workflow_kallisto, files_and_temp_names, processors)
    for name in files_and_temp_names:
        names.append(name[0])
    return names

def _perform_workflow_kallisto(data):
    sequences = data[1]
    if len(sequences) == 1:
        subprocess.check_call("kallisto quant --single --bias -l 100 -s 30 -o %s -i %s %s > /dev/null 2>&1" % (data[0],"REF","".join(sequences)),shell=True)
    else:
        subprocess.check_call("kallisto quant --bias -o %s -i %s %s %s > /dev/null 2>&1" % (data[0],"REF",sequences[0],sequences[1]), shell=True)

def process_kallisto_dev(names):
    for name in names:
        outfile = open("%s.tmp" % name, "w")
        outfile.write(str(name)+"\n")
        for line in open("%s/abundance.tsv" % name):
            newline=line.strip()
            if line.startswith("target_id"):
                pass
            else:
                fields=newline.split()
                value = round(float(fields[3]))
                outfile.write(str(int(value))+"\n")
        outfile.close()

def get_ref_fields(genome):
    outfile = open("ref.list", "w")
    outfile.write("target"+"\n")
    for line in open("%s/abundance.tsv" % genome):
        if line.startswith("target_id"):
            pass
        else:
            fields=line.split()
            outfile.write(fields[0]+"\n")
    outfile.close()

def main(read_dir,reference,processors):
    start_dir = os.getcwd()
    start_path=os.path.abspath(start_dir)
    ref_path=os.path.abspath("%s" % reference)
    dir_path=os.path.abspath("%s" % read_dir)
    ac=subprocess.call(['which','kallisto'])
    if ac == 0:
        pass
    else:
        print("kallisto must be in your path")
        sys.exit()
    fileSets=read_file_sets(dir_path)
    subprocess.check_call("kallisto index %s -i REF > /dev/null 2>&1" % ref_path, shell=True)
    logging.info("Running kallisto")
    names=run_kallisto_loop_dev(fileSets,dir_path,"REF",processors)
    logging.info("kallisto finished")
    process_kallisto_dev(names)
    get_ref_fields(names[0])
    os.system("paste ref.list *.tmp > kallisto_merged_counts.txt")
    os.system("rm ref.list *.tmp")

if __name__ == "__main__":
    usage="usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--read_dir", dest="read_dir",
                      help="directory of FASTQ reads [REQUIRED]",
                      action="callback", callback=test_dir, type="string")
    parser.add_option("-r", "--reference", dest="reference",
                      help="reference genome in FASTA format [REQUIRED]",
                      action="callback", callback=test_file, type="string")
    parser.add_option("-p", "--processors", dest="processors",
                      help="number of processors to use, defaults to 4",
                      action="store", default="4", type="int")
    options, args = parser.parse_args()

    mandatories = ["read_dir","reference"]
    for m in mandatories:
        if not options.__dict__[m]:
            print("\nMust provide %s.\n" %m)
            parser.print_help()
            exit(-1)

    main(options.read_dir,options.reference,options.processors)
	#!/usr/bin/env python

	"""Read counts across a set of reference sequences.
	Requires Python 2.7 to run"""

	from __future__ import division
	from __future__ import print_function
	from optparse import OptionParser
	import sys
	import os
	import subprocess
	import glob
	import re
	import logging
	from subprocess import Popen

	def test_file(option, opt_str, value, parser):
	try:
	with open(value): setattr(parser.values, option.dest, value)
	except IOError:
	print('%s file cannot be opened' % option)
	sys.exit()

	def test_dir(option, opt_str, value, parser):
	if os.path.exists(value):
	setattr(parser.values, option.dest, value)
	else:
	print("directory of fastqs cannot be found")
	sys.exit()

	def get_readFile_components(full_file_path):
	"""function adapted from:
	https://github.com/katholt/srst2"""
	(file_path,file_name) = os.path.split(full_file_path)
	m1 = re.match("(.*).gz",file_name)
	ext = ""
	if m1 != None:
	ext = ".gz"
	file_name = m1.groups()[0]
	(file_name_before_ext,ext2) = os.path.splitext(file_name)
	full_ext = ext2+ext
	return(file_path,file_name_before_ext,full_ext)

	def read_file_sets(dir_path):
	"""match up pairs of sequence data, adapted from
	https://github.com/katholt/srst2"""
	fileSets = {}
	forward_reads = {}
	reverse_reads = {}
	num_paired_readsets = 0
	num_single_readsets = 0
	for infile in glob.glob(os.path.join(dir_path, "*.fastq.gz")):
	(file_path,file_name_before_ext,full_ext) = get_readFile_components(infile)
	m=re.match("(.)(_S.)(_L.)(_R.)(_.*)", file_name_before_ext)
	if m is None:
	m=re.match("(.)("+"_R1"+")(_.)$",file_name_before_ext)
	if m is not None:
	(baseName,read) = m.groups()[0], m.groups()[1]
	forward_reads[baseName] = infile
	else:
	m=re.match("(.)("+"_R2"+")(_.)$",file_name_before_ext)
	if m is not None:
	(baseName,read) = m.groups()[0], m.groups()[1]
	reverse_reads[baseName] = infile
	else:
	print("Could not determine forward/reverse read status for input file %s" % infile)
	else:
	baseName, read = m.groups()[0], m.groups()[3]
	if read == "_R1":
	forward_reads[baseName] = infile
	elif read == "_R2":
	reverse_reads[baseName] = infile
	else:
	print("Could not determine forward/reverse read status for input file")
	fileSets[file_name_before_ext] = infile
	num_single_readsets += 1
	for sample in forward_reads:
	if sample in reverse_reads:
	fileSets[sample] = [forward_reads[sample],reverse_reads[sample]] # store pair
	num_paired_readsets += 1
	else:
	fileSets[sample] = [forward_reads[sample]] # no reverse found
	num_single_readsets += 1
	logging.info('Warning, could not find pair for read:' + forward_reads[sample])
	for sample in reverse_reads:
	if sample not in fileSets:
	fileSets[sample] = reverse_reads[sample] # no forward found
	num_single_readsets += 1
	logging.info('Warning, could not find pair for read:' + reverse_reads[sample])
	if num_paired_readsets > 0:
	logging.info('Total paired readsets found:' + str(num_paired_readsets))
	if num_single_readsets > 0:
	logging.info('Total single reads found:' + str(num_single_readsets))
	return fileSets

	def mp_shell(func, params, numProc):
	from multiprocessing import Pool
	p = Pool(numProc)
	out = p.map(func, params)
	p.terminate()
	return out

	def run_kallisto_loop_dev(fileSets,dir_path,reference,processors):
	names = []
	""""idx is the sample name, f is the file dictionary"""
	files_and_temp_names = [(str(idx), list(f)) for idx, f in fileSets.iteritems()]
	mp_shell(_perform_workflow_kallisto, files_and_temp_names, processors)
	for name in files_and_temp_names:
	names.append(name[0])
	return names

	def _perform_workflow_kallisto(data):
	sequences = data[1]
	if len(sequences) == 1:
	subprocess.check_call("kallisto quant --single --bias -l 100 -s 30 -o %s -i %s %s > /dev/null 2>&1" % (data[0],"REF","".join(sequences)),shell=True)
	else:
	subprocess.check_call("kallisto quant --bias -o %s -i %s %s %s > /dev/null 2>&1" % (data[0],"REF",sequences[0],sequences[1]), shell=True)

	def process_kallisto_dev(names):
	for name in names:
	outfile = open("%s.tmp" % name, "w")
	outfile.write(str(name)+"\n")
	for line in open("%s/abundance.tsv" % name):
	newline=line.strip()
	if line.startswith("target_id"):
	pass
	else:
	fields=newline.split()
	value = round(float(fields[3]))
	outfile.write(str(int(value))+"\n")
	outfile.close()

	def get_ref_fields(genome):
	outfile = open("ref.list", "w")
	outfile.write("target"+"\n")
	for line in open("%s/abundance.tsv" % genome):
	if line.startswith("target_id"):
	pass
	else:
	fields=line.split()
	outfile.write(fields[0]+"\n")
	outfile.close()

	def main(read_dir,reference,processors):
	start_dir = os.getcwd()
	start_path=os.path.abspath(start_dir)
	ref_path=os.path.abspath("%s" % reference)
	dir_path=os.path.abspath("%s" % read_dir)
	ac=subprocess.call(['which','kallisto'])
	if ac == 0:
	pass
	else:
	print("kallisto must be in your path")
	sys.exit()
	fileSets=read_file_sets(dir_path)
	subprocess.check_call("kallisto index %s -i REF > /dev/null 2>&1" % ref_path, shell=True)
	logging.info("Running kallisto")
	names=run_kallisto_loop_dev(fileSets,dir_path,"REF",processors)
	logging.info("kallisto finished")
	process_kallisto_dev(names)
	get_ref_fields(names[0])
	os.system("paste ref.list *.tmp > kallisto_merged_counts.txt")
	os.system("rm ref.list *.tmp")

	if __name__ == "__main__":
	usage="usage: %prog [options]"
	parser = OptionParser(usage=usage)
	parser.add_option("-d", "--read_dir", dest="read_dir",
	help="directory of FASTQ reads [REQUIRED]",
	action="callback", callback=test_dir, type="string")
	parser.add_option("-r", "--reference", dest="reference",
	help="reference genome in FASTA format [REQUIRED]",
	action="callback", callback=test_file, type="string")
	parser.add_option("-p", "--processors", dest="processors",
	help="number of processors to use, defaults to 4",
	action="store", default="4", type="int")
	options, args = parser.parse_args()

	mandatories = ["read_dir","reference"]
	for m in mandatories:
	if not options.__dict__[m]:
	print("\nMust provide %s.\n" %m)
	parser.print_help()
	exit(-1)

	main(options.read_dir,options.reference,options.processors)