Skip to content

Instantly share code, notes, and snippets.

@jasonsahl
Created December 1, 2017 17:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jasonsahl/5c24196cdceba9155555619c3b76a699 to your computer and use it in GitHub Desktop.
Save jasonsahl/5c24196cdceba9155555619c3b76a699 to your computer and use it in GitHub Desktop.
Kallisto read count wrapper
#!/usr/bin/env python
"""Read counts across a set of reference sequences.
Requires Python 2.7 to run"""
from __future__ import division
from __future__ import print_function
from optparse import OptionParser
import sys
import os
import subprocess
import glob
import re
import logging
from subprocess import Popen
def test_file(option, opt_str, value, parser):
try:
with open(value): setattr(parser.values, option.dest, value)
except IOError:
print('%s file cannot be opened' % option)
sys.exit()
def test_dir(option, opt_str, value, parser):
if os.path.exists(value):
setattr(parser.values, option.dest, value)
else:
print("directory of fastqs cannot be found")
sys.exit()
def get_readFile_components(full_file_path):
"""function adapted from:
https://github.com/katholt/srst2"""
(file_path,file_name) = os.path.split(full_file_path)
m1 = re.match("(.*).gz",file_name)
ext = ""
if m1 != None:
ext = ".gz"
file_name = m1.groups()[0]
(file_name_before_ext,ext2) = os.path.splitext(file_name)
full_ext = ext2+ext
return(file_path,file_name_before_ext,full_ext)
def read_file_sets(dir_path):
"""match up pairs of sequence data, adapted from
https://github.com/katholt/srst2"""
fileSets = {}
forward_reads = {}
reverse_reads = {}
num_paired_readsets = 0
num_single_readsets = 0
for infile in glob.glob(os.path.join(dir_path, "*.fastq.gz")):
(file_path,file_name_before_ext,full_ext) = get_readFile_components(infile)
m=re.match("(.*)(_S.*)(_L.*)(_R.*)(_.*)", file_name_before_ext)
if m is None:
m=re.match("(.*)("+"_R1"+")(_.*)$",file_name_before_ext)
if m is not None:
(baseName,read) = m.groups()[0], m.groups()[1]
forward_reads[baseName] = infile
else:
m=re.match("(.*)("+"_R2"+")(_.*)$",file_name_before_ext)
if m is not None:
(baseName,read) = m.groups()[0], m.groups()[1]
reverse_reads[baseName] = infile
else:
print("Could not determine forward/reverse read status for input file %s" % infile)
else:
baseName, read = m.groups()[0], m.groups()[3]
if read == "_R1":
forward_reads[baseName] = infile
elif read == "_R2":
reverse_reads[baseName] = infile
else:
print("Could not determine forward/reverse read status for input file")
fileSets[file_name_before_ext] = infile
num_single_readsets += 1
for sample in forward_reads:
if sample in reverse_reads:
fileSets[sample] = [forward_reads[sample],reverse_reads[sample]] # store pair
num_paired_readsets += 1
else:
fileSets[sample] = [forward_reads[sample]] # no reverse found
num_single_readsets += 1
logging.info('Warning, could not find pair for read:' + forward_reads[sample])
for sample in reverse_reads:
if sample not in fileSets:
fileSets[sample] = reverse_reads[sample] # no forward found
num_single_readsets += 1
logging.info('Warning, could not find pair for read:' + reverse_reads[sample])
if num_paired_readsets > 0:
logging.info('Total paired readsets found:' + str(num_paired_readsets))
if num_single_readsets > 0:
logging.info('Total single reads found:' + str(num_single_readsets))
return fileSets
def mp_shell(func, params, numProc):
from multiprocessing import Pool
p = Pool(numProc)
out = p.map(func, params)
p.terminate()
return out
def run_kallisto_loop_dev(fileSets,dir_path,reference,processors):
names = []
""""idx is the sample name, f is the file dictionary"""
files_and_temp_names = [(str(idx), list(f)) for idx, f in fileSets.iteritems()]
mp_shell(_perform_workflow_kallisto, files_and_temp_names, processors)
for name in files_and_temp_names:
names.append(name[0])
return names
def _perform_workflow_kallisto(data):
sequences = data[1]
if len(sequences) == 1:
subprocess.check_call("kallisto quant --single --bias -l 100 -s 30 -o %s -i %s %s > /dev/null 2>&1" % (data[0],"REF","".join(sequences)),shell=True)
else:
subprocess.check_call("kallisto quant --bias -o %s -i %s %s %s > /dev/null 2>&1" % (data[0],"REF",sequences[0],sequences[1]), shell=True)
def process_kallisto_dev(names):
for name in names:
outfile = open("%s.tmp" % name, "w")
outfile.write(str(name)+"\n")
for line in open("%s/abundance.tsv" % name):
newline=line.strip()
if line.startswith("target_id"):
pass
else:
fields=newline.split()
value = round(float(fields[3]))
outfile.write(str(int(value))+"\n")
outfile.close()
def get_ref_fields(genome):
outfile = open("ref.list", "w")
outfile.write("target"+"\n")
for line in open("%s/abundance.tsv" % genome):
if line.startswith("target_id"):
pass
else:
fields=line.split()
outfile.write(fields[0]+"\n")
outfile.close()
def main(read_dir,reference,processors):
start_dir = os.getcwd()
start_path=os.path.abspath(start_dir)
ref_path=os.path.abspath("%s" % reference)
dir_path=os.path.abspath("%s" % read_dir)
ac=subprocess.call(['which','kallisto'])
if ac == 0:
pass
else:
print("kallisto must be in your path")
sys.exit()
fileSets=read_file_sets(dir_path)
subprocess.check_call("kallisto index %s -i REF > /dev/null 2>&1" % ref_path, shell=True)
logging.info("Running kallisto")
names=run_kallisto_loop_dev(fileSets,dir_path,"REF",processors)
logging.info("kallisto finished")
process_kallisto_dev(names)
get_ref_fields(names[0])
os.system("paste ref.list *.tmp > kallisto_merged_counts.txt")
os.system("rm ref.list *.tmp")
if __name__ == "__main__":
usage="usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option("-d", "--read_dir", dest="read_dir",
help="directory of FASTQ reads [REQUIRED]",
action="callback", callback=test_dir, type="string")
parser.add_option("-r", "--reference", dest="reference",
help="reference genome in FASTA format [REQUIRED]",
action="callback", callback=test_file, type="string")
parser.add_option("-p", "--processors", dest="processors",
help="number of processors to use, defaults to 4",
action="store", default="4", type="int")
options, args = parser.parse_args()
mandatories = ["read_dir","reference"]
for m in mandatories:
if not options.__dict__[m]:
print("\nMust provide %s.\n" %m)
parser.print_help()
exit(-1)
main(options.read_dir,options.reference,options.processors)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment