Skip to content

Instantly share code, notes, and snippets.

@jasonsahl
Last active December 15, 2015 17:09
Show Gist options
  • Save jasonsahl/5294391 to your computer and use it in GitHub Desktop.
Save jasonsahl/5294391 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""filter a NASP formatted SNP
matrix, to only include a list of genomes.
The collections module requires Python 2.7+.
This script has not been tested with Python 3"""
from optparse import OptionParser
from collections import deque
import sys
import collections
def test_file(option, opt_str, value, parser):
try:
with open(value): setattr(parser.values, option.dest, value)
except IOError:
print '%s file cannot be opened' % option
sys.exit()
def filter_genomes(genomes, in_matrix):
in_matrix = open(in_matrix, "rU")
firstLine = in_matrix.readline()
first_fields = firstLine.split()
last=first_fields.index("#SNPcall")
all_genomes=first_fields[:last]
genomes_file = open(genomes, "rU").read().splitlines()
to_keep = [ ]
for x in all_genomes[1:]:
if x not in genomes_file:
to_keep.append(all_genomes.index(x))
return to_keep[1:]
in_matrix.close()
def filter_matrix(to_keep, in_matrix, prefix):
matrix = open(in_matrix, "rU")
outfile = open("%s_genomes.matrix" % prefix, "w")
for line in matrix:
fields = line.split()
deque((list.pop(fields, i) for i in sorted(to_keep, reverse=True)), maxlen=0)
print >> outfile, "\t".join(fields)
outfile.close()
def filter_monomorphics(in_matrix, prefix):
hits = []
matrix = open(in_matrix, "U")
outfile = open("%s_matrix_polymorphic.txt" % prefix, "w")
firstLine = matrix.readline()
first_fields = firstLine.split()
last=first_fields.index("#SNPcall")
doubles=[]
print >> outfile, firstLine,
for line in matrix:
fields = line.split()
for field in fields[1:last]:
if len(field)>=2:
doubles.append("1")
else:
pass
if len(doubles)>=1:
pass
else:
if "X" not in fields[1:last] and "N" not in fields[1:last] and "." not in fields[1:last] and "n" not in fields[1:last]:
counter=collections.Counter(fields[1:last])
values=counter.values()
values.sort(key=int)
if len(values)>int(1):
print >> outfile, line,
hits.append("1")
else:
continue
else:
continue
doubles=[]
print "number of polymorphic SNPs = " ,len(hits)
def main(in_matrix, prefix, genomes):
to_keep=filter_genomes(genomes, in_matrix)
filter_matrix(to_keep, in_matrix, prefix)
filter_monomorphics("%s_genomes.matrix" % prefix, prefix)
if __name__ == "__main__":
usage="usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option("-m", "--in_matrix", dest="in_matrix",
help="/path/to/NASP matrix [REQUIRED]",
action="callback", callback=test_file, type="string")
parser.add_option("-p", "--out_prefix", dest="prefix",
help="prefix for renaming output files [REQUIRED]",
action="store", type="string")
parser.add_option("-g", "--genomes", dest="genomes",
help="/path/to/genomes_file [REQUIRED]",
action="callback",callback=test_file, type="string")
options, args = parser.parse_args()
mandatories = ["in_matrix", "prefix", "genomes"]
for m in mandatories:
if not options.__dict__[m]:
print "\nMust provide %s.\n" %m
parser.print_help()
exit(-1)
main(options.in_matrix,options.prefix,options.genomes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment