Last active
December 15, 2015 17:09
-
-
Save jasonsahl/5294391 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""filter a NASP formatted SNP | |
matrix, to only include a list of genomes. | |
The collections module requires Python 2.7+. | |
This script has not been tested with Python 3""" | |
from optparse import OptionParser | |
from collections import deque | |
import sys | |
import collections | |
def test_file(option, opt_str, value, parser): | |
try: | |
with open(value): setattr(parser.values, option.dest, value) | |
except IOError: | |
print '%s file cannot be opened' % option | |
sys.exit() | |
def filter_genomes(genomes, in_matrix): | |
in_matrix = open(in_matrix, "rU") | |
firstLine = in_matrix.readline() | |
first_fields = firstLine.split() | |
last=first_fields.index("#SNPcall") | |
all_genomes=first_fields[:last] | |
genomes_file = open(genomes, "rU").read().splitlines() | |
to_keep = [ ] | |
for x in all_genomes[1:]: | |
if x not in genomes_file: | |
to_keep.append(all_genomes.index(x)) | |
return to_keep[1:] | |
in_matrix.close() | |
def filter_matrix(to_keep, in_matrix, prefix): | |
matrix = open(in_matrix, "rU") | |
outfile = open("%s_genomes.matrix" % prefix, "w") | |
for line in matrix: | |
fields = line.split() | |
deque((list.pop(fields, i) for i in sorted(to_keep, reverse=True)), maxlen=0) | |
print >> outfile, "\t".join(fields) | |
outfile.close() | |
def filter_monomorphics(in_matrix, prefix): | |
hits = [] | |
matrix = open(in_matrix, "U") | |
outfile = open("%s_matrix_polymorphic.txt" % prefix, "w") | |
firstLine = matrix.readline() | |
first_fields = firstLine.split() | |
last=first_fields.index("#SNPcall") | |
doubles=[] | |
print >> outfile, firstLine, | |
for line in matrix: | |
fields = line.split() | |
for field in fields[1:last]: | |
if len(field)>=2: | |
doubles.append("1") | |
else: | |
pass | |
if len(doubles)>=1: | |
pass | |
else: | |
if "X" not in fields[1:last] and "N" not in fields[1:last] and "." not in fields[1:last] and "n" not in fields[1:last]: | |
counter=collections.Counter(fields[1:last]) | |
values=counter.values() | |
values.sort(key=int) | |
if len(values)>int(1): | |
print >> outfile, line, | |
hits.append("1") | |
else: | |
continue | |
else: | |
continue | |
doubles=[] | |
print "number of polymorphic SNPs = " ,len(hits) | |
def main(in_matrix, prefix, genomes): | |
to_keep=filter_genomes(genomes, in_matrix) | |
filter_matrix(to_keep, in_matrix, prefix) | |
filter_monomorphics("%s_genomes.matrix" % prefix, prefix) | |
if __name__ == "__main__": | |
usage="usage: %prog [options]" | |
parser = OptionParser(usage=usage) | |
parser.add_option("-m", "--in_matrix", dest="in_matrix", | |
help="/path/to/NASP matrix [REQUIRED]", | |
action="callback", callback=test_file, type="string") | |
parser.add_option("-p", "--out_prefix", dest="prefix", | |
help="prefix for renaming output files [REQUIRED]", | |
action="store", type="string") | |
parser.add_option("-g", "--genomes", dest="genomes", | |
help="/path/to/genomes_file [REQUIRED]", | |
action="callback",callback=test_file, type="string") | |
options, args = parser.parse_args() | |
mandatories = ["in_matrix", "prefix", "genomes"] | |
for m in mandatories: | |
if not options.__dict__[m]: | |
print "\nMust provide %s.\n" %m | |
parser.print_help() | |
exit(-1) | |
main(options.in_matrix,options.prefix,options.genomes) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment