Last active
December 23, 2015 18:38
-
-
Save jasonsahl/6676663 to your computer and use it in GitHub Desktop.
This script tries to identify portions of a reference genome that have been recombined. The required input is a NASP formatted SNP matrix and the parsimony log from Paup.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import division | |
"""calculates the SNP and homoplash density | |
using a NASP formatted SNP matrix""" | |
import optparse | |
import sys | |
import collections | |
import os | |
def get_index(matrix): | |
"""Return the coordinate and reference chromosome | |
from a NASP matrix""" | |
in_matrix = open(matrix, "U") | |
positions={} | |
firstLine = in_matrix.readline() | |
first_fields = firstLine.split() | |
coords=first_fields.index("Position") | |
chrom=first_fields.index("Contig") | |
for line in in_matrix: | |
fields = line.split("\t") | |
"""this functionality is for multi-fasta reference fiels""" | |
positions.update({fields[chrom]:fields[coords]}) | |
return positions | |
in_matrix.close() | |
def parse_matrix(matrix, step, positions): | |
start=1 | |
in_matrix = open(matrix, "U") | |
outfile = open("out.txt", "w") | |
firstLine = in_matrix.readline() | |
first_fields = firstLine.split("\t") | |
chrom = first_fields.index("Contig") | |
coords=first_fields.index("Position") | |
last=first_fields.index("#SNPcall") | |
for k,v in positions.iteritems(): | |
for i in xrange(int(start), int(v), int(step)): | |
positives=[ ] | |
my_range=range(i,i+step) | |
with open(matrix, "U") as f: | |
next(f) | |
for line in f: | |
fields = line.split() | |
if fields[chrom]==k: | |
if int(fields[coords])>max(my_range): | |
break | |
else: | |
if int(fields[coords]) in my_range: | |
counter=collections.Counter(fields[1:last]) | |
values=counter.values() | |
values.sort(key=int) | |
if len(values)==1: | |
sys.exc_clear() | |
else: | |
for z in range(2,5): | |
if len(values)==int(z) and values[z-2]>1: positives.append("1") | |
else: | |
pass | |
print >> outfile, k, i, i+step, len(positives) | |
in_matrix.close() | |
def parse_paup_log(matrix, paup, ri): | |
entries = [ ] | |
with open(paup, "U") as myFile: | |
for num, line in enumerate(myFile, 1): | |
if "Range" in line: | |
entries.append(num) | |
loi = max(entries) | |
homop = [ ] | |
with open(paup, "U") as f: | |
for line in f.readlines()[loi:]: | |
fields=line.split() | |
try: | |
if float(fields[6])<=float(ri): | |
homop.append(fields[0]) | |
except: | |
pass | |
in_matrix=open(matrix, "U") | |
outfile = open("homoplasies.txt", "w") | |
firstLine = in_matrix.readline() | |
print >> outfile,"homoplasy" | |
for num, line in enumerate(in_matrix, 1): | |
fields=line.split() | |
if str(num) in homop: | |
print >> outfile, fields[0],"\n", | |
else: | |
print >> outfile, "none", "\n", | |
def find_homoplasies(in_matrix, coords_file): | |
matrix = open(in_matrix, "U") | |
firstLine = matrix.readline() | |
first_fields = firstLine.split("\t") | |
coords=first_fields.index("Position") | |
chrom = first_fields.index("Chromosome") | |
outfile = open("SNP_HP_density.txt", "w") | |
matrix.close() | |
for line in open(coords_file, "U"): | |
new_fields = line.split() | |
hits = [ ] | |
with open(in_matrix) as f: | |
next(f) | |
for stuff in f: | |
fields=stuff.split() | |
if "none" not in fields[-1]: | |
if fields[chrom]==new_fields[0]: | |
try: | |
if int(fields[coords])>=int(new_fields[1]) and int(fields[coords])<=int(new_fields[2]): | |
hits.append("1") | |
except: | |
pass | |
if int(len(hits))>=1: | |
try: | |
print >> outfile,"\t".join(new_fields),"\t",len(hits),"\t",float(int(len(hits))/int(new_fields[3])),"\n", | |
except: | |
print >> outfile,"\t".join(new_fields),"\t",len(hits),"\t","0","\n", | |
else: | |
print >> outfile,"\t".join(new_fields),"\t","0","\t","0","\n", | |
def test_file(option, opt_str, value, parser): | |
try: | |
with open(value): setattr(parser.values, option.dest, value) | |
except IOError: | |
print '%s file cannot be opened' % option | |
sys.exit() | |
def main(matrix, step, paup, ri): | |
positions=get_index(matrix) | |
print positions | |
print "calculating SNP density" | |
parse_matrix(matrix, step, positions) | |
print "done" | |
parse_paup_log(matrix, paup, ri) | |
os.system("paste %s homoplasies.txt > new_matrix" % matrix) | |
print "calculating homoplasy density" | |
find_homoplasies("new_matrix", "out.txt") | |
print "done" | |
os.system("rm out.txt homoplasies.txt new_matrix") | |
if __name__ == "__main__": | |
usage="usage: %prog [options]" | |
parser = optparse.OptionParser(usage=usage) | |
parser.add_option("-m", "--matrix", dest="matrix", | |
help="/path/to/NASP formatted SNP matrix [REQUIRED]", | |
type="string", action="callback", callback=test_file) | |
parser.add_option("-s", "--step", dest="step", action="store", | |
help="size for stepping through the genome, defaults to 1000", | |
type="int", default="1000") | |
parser.add_option("-p", "--paup", dest="paup", action="callback", callback=test_file, | |
help="path to paup log file [REQUIRED]", type="string") | |
parser.add_option("-r", "--ri", dest="ri", action="store", | |
help="RI values below this are determined to be homoplastic, defaults to 0.5", | |
type="float", default="0.5") | |
options, args = parser.parse_args() | |
mandatories = ["matrix","paup"] | |
for m in mandatories: | |
if not getattr(options, m, None): | |
print "\nMust provide %s.\n" %m | |
parser.print_help() | |
exit(-1) | |
main(options.matrix, options.step, options.paup, options.ri) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment