Skip to content

Instantly share code, notes, and snippets.

@jasonsahl
Last active December 23, 2015 18:38
Show Gist options
  • Save jasonsahl/6676663 to your computer and use it in GitHub Desktop.
Save jasonsahl/6676663 to your computer and use it in GitHub Desktop.
This script tries to identify portions of a reference genome that have been recombined. The required input is a NASP formatted SNP matrix and the parsimony log from Paup.
#!/usr/bin/env python
from __future__ import division
"""calculates the SNP and homoplash density
using a NASP formatted SNP matrix"""
import optparse
import sys
import collections
import os
def get_index(matrix):
"""Return the coordinate and reference chromosome
from a NASP matrix"""
in_matrix = open(matrix, "U")
positions={}
firstLine = in_matrix.readline()
first_fields = firstLine.split()
coords=first_fields.index("Position")
chrom=first_fields.index("Contig")
for line in in_matrix:
fields = line.split("\t")
"""this functionality is for multi-fasta reference fiels"""
positions.update({fields[chrom]:fields[coords]})
return positions
in_matrix.close()
def parse_matrix(matrix, step, positions):
start=1
in_matrix = open(matrix, "U")
outfile = open("out.txt", "w")
firstLine = in_matrix.readline()
first_fields = firstLine.split("\t")
chrom = first_fields.index("Contig")
coords=first_fields.index("Position")
last=first_fields.index("#SNPcall")
for k,v in positions.iteritems():
for i in xrange(int(start), int(v), int(step)):
positives=[ ]
my_range=range(i,i+step)
with open(matrix, "U") as f:
next(f)
for line in f:
fields = line.split()
if fields[chrom]==k:
if int(fields[coords])>max(my_range):
break
else:
if int(fields[coords]) in my_range:
counter=collections.Counter(fields[1:last])
values=counter.values()
values.sort(key=int)
if len(values)==1:
sys.exc_clear()
else:
for z in range(2,5):
if len(values)==int(z) and values[z-2]>1: positives.append("1")
else:
pass
print >> outfile, k, i, i+step, len(positives)
in_matrix.close()
def parse_paup_log(matrix, paup, ri):
entries = [ ]
with open(paup, "U") as myFile:
for num, line in enumerate(myFile, 1):
if "Range" in line:
entries.append(num)
loi = max(entries)
homop = [ ]
with open(paup, "U") as f:
for line in f.readlines()[loi:]:
fields=line.split()
try:
if float(fields[6])<=float(ri):
homop.append(fields[0])
except:
pass
in_matrix=open(matrix, "U")
outfile = open("homoplasies.txt", "w")
firstLine = in_matrix.readline()
print >> outfile,"homoplasy"
for num, line in enumerate(in_matrix, 1):
fields=line.split()
if str(num) in homop:
print >> outfile, fields[0],"\n",
else:
print >> outfile, "none", "\n",
def find_homoplasies(in_matrix, coords_file):
matrix = open(in_matrix, "U")
firstLine = matrix.readline()
first_fields = firstLine.split("\t")
coords=first_fields.index("Position")
chrom = first_fields.index("Chromosome")
outfile = open("SNP_HP_density.txt", "w")
matrix.close()
for line in open(coords_file, "U"):
new_fields = line.split()
hits = [ ]
with open(in_matrix) as f:
next(f)
for stuff in f:
fields=stuff.split()
if "none" not in fields[-1]:
if fields[chrom]==new_fields[0]:
try:
if int(fields[coords])>=int(new_fields[1]) and int(fields[coords])<=int(new_fields[2]):
hits.append("1")
except:
pass
if int(len(hits))>=1:
try:
print >> outfile,"\t".join(new_fields),"\t",len(hits),"\t",float(int(len(hits))/int(new_fields[3])),"\n",
except:
print >> outfile,"\t".join(new_fields),"\t",len(hits),"\t","0","\n",
else:
print >> outfile,"\t".join(new_fields),"\t","0","\t","0","\n",
def test_file(option, opt_str, value, parser):
try:
with open(value): setattr(parser.values, option.dest, value)
except IOError:
print '%s file cannot be opened' % option
sys.exit()
def main(matrix, step, paup, ri):
positions=get_index(matrix)
print positions
print "calculating SNP density"
parse_matrix(matrix, step, positions)
print "done"
parse_paup_log(matrix, paup, ri)
os.system("paste %s homoplasies.txt > new_matrix" % matrix)
print "calculating homoplasy density"
find_homoplasies("new_matrix", "out.txt")
print "done"
os.system("rm out.txt homoplasies.txt new_matrix")
if __name__ == "__main__":
usage="usage: %prog [options]"
parser = optparse.OptionParser(usage=usage)
parser.add_option("-m", "--matrix", dest="matrix",
help="/path/to/NASP formatted SNP matrix [REQUIRED]",
type="string", action="callback", callback=test_file)
parser.add_option("-s", "--step", dest="step", action="store",
help="size for stepping through the genome, defaults to 1000",
type="int", default="1000")
parser.add_option("-p", "--paup", dest="paup", action="callback", callback=test_file,
help="path to paup log file [REQUIRED]", type="string")
parser.add_option("-r", "--ri", dest="ri", action="store",
help="RI values below this are determined to be homoplastic, defaults to 0.5",
type="float", default="0.5")
options, args = parser.parse_args()
mandatories = ["matrix","paup"]
for m in mandatories:
if not getattr(options, m, None):
print "\nMust provide %s.\n" %m
parser.print_help()
exit(-1)
main(options.matrix, options.step, options.paup, options.ri)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment