Skip to content

Instantly share code, notes, and snippets.

@jasonsahl
Last active March 5, 2020 18:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jasonsahl/e9516b2d940ad2474ba6e97f5b856440 to your computer and use it in GitHub Desktop.
Save jasonsahl/e9516b2d940ad2474ba6e97f5b856440 to your computer and use it in GitHub Desktop.
Parse Kmer frequencies from a Kmer matrix
#!/usr/bin/env python
"""parse frequencies from a Kmer matrix"""
from __future__ import division
import sys
import os
import optparse
from optparse import OptionParser
from collections import deque
def test_file(option, opt_str, value, parser):
try:
with open(value): setattr(parser.values, option.dest, value)
except IOError:
print('%s file cannot be opened' % option)
sys.exit()
def prune_matrix(matrix,resist,suscept):
"""prune out genomes of interest from a matrix.
Not done efficiently, but appears to work"""
#Group 1 is resistant
group1_ids = []
#Group 2 is susceptible
group2_ids = []
group1_out = open("resist_pruned.txt", "w")
group2_out = open("suscept_pruned.txt", "w")
with open(resist) as file_1:
for line in file_1:
line.strip("\n")
group1_ids.append(line)
with open(suscept) as file_2:
for line in file_2:
line.strip("\n")
group2_ids.append(line)
group1_ids = map(lambda s: s.strip(), group1_ids)
group2_ids = map(lambda s: s.strip(), group2_ids)
first_line = next(open(matrix))
newline = first_line.strip("\n")
fields = newline.split()
original_fields = fields.copy()
group1_idx = []
group2_idx = []
group1_ids_list = list(group1_ids)
group2_ids_list = list(group2_ids)
for x in fields:
"""These are the genomes to prune out"""
if x not in group1_ids_list: group1_idx.append(fields.index(x))
if x not in group2_ids_list: group2_idx.append(fields.index(x))
#This is for group 1
deque((list.pop(fields, i) for i in sorted(group1_idx, reverse=True)), maxlen=0)
fields.insert(0,"Kmer")
group1_out.write("\t".join(fields)+"\n")
#This is for group 2
first_line = next(open(matrix))
#original_fields.insert(0,"cluster")
deque((list.pop(original_fields, i) for i in sorted(group2_idx, reverse=True)), maxlen=0)
original_fields.insert(0,"Kmer")
#original_fields.insert(0,"Kmer")
group2_out.write("\t".join(original_fields)+"\n")
with open(matrix) as in_matrix:
in_matrix.readline()
for line in in_matrix:
newline = line.strip("\n")
fields = newline.split()
#Here I make a copy of these fields to prevent them being altered
fields_copy = fields.copy()
name = fields[0]
deque((list.pop(fields, i) for i in sorted(group1_idx, reverse=True)), maxlen=0)
group1_out.write("".join(name)+"\t"+"\t".join(fields)+"\n")
name = fields_copy[0]
deque((list.pop(fields_copy, i) for i in sorted(group2_idx, reverse=True)), maxlen=0)
group2_out.write("".join(name)+"\t"+"\t".join(fields_copy)+"\n")
group1_out.close()
group2_out.close()
def compare_values(resist,suscept):
resist_values = []
suscept_values = []
kmer_names = []
with open(resist) as resist_file:
resist_file.readline()
for line in resist_file:
newline = line.strip()
fields = newline.split()
ints = list(map(int,fields[1:]))
resist_values.append(sum(ints)/len(ints))
kmer_names.append(fields[0])
with open(suscept) as suscept_file:
suscept_file.readline()
for line in suscept_file:
newline = line.strip()
fields = newline.split()
ints = list(map(int,fields[1:]))
suscept_values.append(sum(ints)/len(ints))
resist_values.insert(0,"resistant")
suscept_values.insert(0,"susceptible")
sstrings = list(map(str,suscept_values))
rstrings = list(map(str,resist_values))
kmer_names.insert(0,"Kmer")
kmer_name_out = open("kmers.tmp", "w")
kmer_name_out.write("\n".join(kmer_names))
kmer_name_out.close()
resist_out = open("resist.tmp", "w")
resist_out.write("\n".join(rstrings))
resist_out.close()
suscept_out = open("suscept.tmp", "w")
suscept_out.write("\n".join(sstrings))
suscept_out.close()
def report_differences(matrix,diff_value):
outfile = open("results_differences.txt", "w")
with open(matrix) as my_matrix:
for line in my_matrix:
if line.startswith("Kmer"):
outfile.write(line)
else:
fields = line.split()
if abs(float(fields[1])-float(fields[2]))>diff_value:
outfile.write(line)
outfile.close()
def main(kmer_matrix,resist_ids,suscept_ids,diff_value):
prune_matrix(kmer_matrix,resist_ids,suscept_ids)
compare_values("resist_pruned.txt","suscept_pruned.txt")
os.system("paste kmers.tmp resist.tmp suscept.tmp > combined.matrix")
report_differences("combined.matrix",diff_value)
os.system("rm kmers.tmp resist.tmp suscept.tmp")
if __name__ == "__main__":
parser = OptionParser(usage="usage: %prog [options]",version="%prog 0.0.0")
parser.add_option("-m", "--kmer_matrix", dest="kmer_matrix",
help="path to Kmer matrix [REQUIRED]",
type="string", action="callback", callback=test_file)
parser.add_option("-r", "--resist_ids", dest="resist_ids",
help="path to file with resistant IDs [REQUIRED]",
type="string", action="callback", callback=test_file)
parser.add_option("-s", "--suscept_ids", dest="suscept_ids",
help="path to file with susceptible IDs [REQUIRED]",
type="string", action="callback", callback=test_file)
parser.add_option("-d", "--diff_value", dest="diff_value",
help="diff value to report, defaults to 0.5",
type="float", action="store", default=0.5)
options, args = parser.parse_args()
mandatories = ["kmer_matrix","resist_ids","suscept_ids"]
for m in mandatories:
if not getattr(options, m, None):
print("\nMust provide %s.\n" %m)
parser.print_help()
exit(-1)
main(options.kmer_matrix,options.resist_ids,options.suscept_ids,options.diff_value)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment