jasonsahl/SNP_HP_density.py

## SNP_HP_density.py
#!/usr/bin/env python

from __future__ import division
"""calculates the SNP and homoplash density
using a NASP formatted SNP matrix"""


import optparse
import sys
import collections
import os


def get_index(matrix):
    """Return the coordinate and reference chromosome
    from a NASP matrix"""
    in_matrix = open(matrix, "U")
    positions={}
    firstLine = in_matrix.readline()
    first_fields = firstLine.split()
    coords=first_fields.index("Position")
    chrom=first_fields.index("Contig")
    for line in in_matrix:
        fields = line.split("\t")
    """this functionality is for multi-fasta reference fiels"""
    positions.update({fields[chrom]:fields[coords]})
    return positions
    in_matrix.close()

def parse_matrix(matrix, step, positions):
    start=1
    in_matrix = open(matrix, "U")
    outfile = open("out.txt", "w")
    firstLine = in_matrix.readline()
    first_fields = firstLine.split("\t")
    chrom = first_fields.index("Contig")
    coords=first_fields.index("Position")
    last=first_fields.index("#SNPcall")
    for k,v in positions.iteritems():
	    for i in xrange(int(start), int(v), int(step)):
		positives=[ ]
		my_range=range(i,i+step)
		with open(matrix, "U") as f:
		     next(f)
		     for line in f:
		         fields = line.split()
			 if fields[chrom]==k:
		             if int(fields[coords])>max(my_range):
				break
			     else:
				if int(fields[coords]) in my_range:
				    counter=collections.Counter(fields[1:last])
				    values=counter.values()
				    values.sort(key=int)
				    if len(values)==1:
				        sys.exc_clear()
				    else:
					for z in range(2,5):
					    if len(values)==int(z) and values[z-2]>1: positives.append("1")
			 else:
			     pass

                print >> outfile, k, i, i+step, len(positives)
    in_matrix.close()

def parse_paup_log(matrix, paup, ri):
    entries = [ ]
    with open(paup, "U") as myFile:
        for num, line in enumerate(myFile, 1):
            if "Range" in line:
                entries.append(num)
    loi = max(entries)
    homop = [ ]
    with open(paup, "U") as f:
        for line in f.readlines()[loi:]:
            fields=line.split()
            try:
                if float(fields[6])<=float(ri):
                    homop.append(fields[0])
            except:
                pass
    in_matrix=open(matrix, "U")
    outfile = open("homoplasies.txt", "w")
    firstLine = in_matrix.readline()
    print >> outfile,"homoplasy"
    for num, line in enumerate(in_matrix, 1):
        fields=line.split()
        if str(num) in homop:
            print >> outfile, fields[0],"\n",
        else:
            print >> outfile, "none", "\n",

def find_homoplasies(in_matrix, coords_file):
    matrix = open(in_matrix, "U")
    firstLine = matrix.readline()
    first_fields = firstLine.split("\t")
    coords=first_fields.index("Position")
    chrom = first_fields.index("Chromosome")
    outfile = open("SNP_HP_density.txt", "w")
    matrix.close()
    for line in open(coords_file, "U"):
        new_fields = line.split()
        hits = [ ]
        with open(in_matrix) as f:
            next(f)
            for stuff in f:
                fields=stuff.split()
                if "none" not in fields[-1]:
                    if fields[chrom]==new_fields[0]:
                        try:
                            if int(fields[coords])>=int(new_fields[1]) and int(fields[coords])<=int(new_fields[2]):
                                hits.append("1")
                        except:
                                pass

        if int(len(hits))>=1:
            try:
                print >> outfile,"\t".join(new_fields),"\t",len(hits),"\t",float(int(len(hits))/int(new_fields[3])),"\n",
            except:
                print >> outfile,"\t".join(new_fields),"\t",len(hits),"\t","0","\n",
        else:
            print >> outfile,"\t".join(new_fields),"\t","0","\t","0","\n",

def test_file(option, opt_str, value, parser):
    try:
        with open(value): setattr(parser.values, option.dest, value)
    except IOError:
        print '%s file cannot be opened' % option
        sys.exit()

def main(matrix, step, paup, ri):
    positions=get_index(matrix)
    print positions
    print "calculating SNP density"
    parse_matrix(matrix, step, positions)
    print "done"
    parse_paup_log(matrix, paup, ri)
    os.system("paste %s homoplasies.txt > new_matrix" % matrix)
    print "calculating homoplasy density"
    find_homoplasies("new_matrix", "out.txt")
    print "done"
    os.system("rm out.txt homoplasies.txt new_matrix")

if __name__ == "__main__":
    usage="usage: %prog [options]"
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-m", "--matrix", dest="matrix",
                      help="/path/to/NASP formatted SNP matrix [REQUIRED]",
                      type="string", action="callback", callback=test_file)
    parser.add_option("-s", "--step", dest="step", action="store",
                      help="size for stepping through the genome, defaults to 1000",
                      type="int", default="1000")
    parser.add_option("-p", "--paup", dest="paup", action="callback", callback=test_file,
                      help="path to paup log file [REQUIRED]", type="string")
    parser.add_option("-r", "--ri", dest="ri", action="store",
                      help="RI values below this are determined to be homoplastic, defaults to 0.5",
                      type="float", default="0.5")
    options, args = parser.parse_args()

    mandatories = ["matrix","paup"]
    for m in mandatories:
        if not getattr(options, m, None):
            print "\nMust provide %s.\n" %m
            parser.print_help()
            exit(-1)

    main(options.matrix, options.step, options.paup, options.ri)
	#!/usr/bin/env python

	from __future__ import division
	"""calculates the SNP and homoplash density
	using a NASP formatted SNP matrix"""


	import optparse
	import sys
	import collections
	import os


	def get_index(matrix):
	"""Return the coordinate and reference chromosome
	from a NASP matrix"""
	in_matrix = open(matrix, "U")
	positions={}
	firstLine = in_matrix.readline()
	first_fields = firstLine.split()
	coords=first_fields.index("Position")
	chrom=first_fields.index("Contig")
	for line in in_matrix:
	fields = line.split("\t")
	"""this functionality is for multi-fasta reference fiels"""
	positions.update({fields[chrom]:fields[coords]})
	return positions
	in_matrix.close()

	def parse_matrix(matrix, step, positions):
	start=1
	in_matrix = open(matrix, "U")
	outfile = open("out.txt", "w")
	firstLine = in_matrix.readline()
	first_fields = firstLine.split("\t")
	chrom = first_fields.index("Contig")
	coords=first_fields.index("Position")
	last=first_fields.index("#SNPcall")
	for k,v in positions.iteritems():
	for i in xrange(int(start), int(v), int(step)):
	positives=[ ]
	my_range=range(i,i+step)
	with open(matrix, "U") as f:
	next(f)
	for line in f:
	fields = line.split()
	if fields[chrom]==k:
	if int(fields[coords])>max(my_range):
	break
	else:
	if int(fields[coords]) in my_range:
	counter=collections.Counter(fields[1:last])
	values=counter.values()
	values.sort(key=int)
	if len(values)==1:
	sys.exc_clear()
	else:
	for z in range(2,5):
	if len(values)==int(z) and values[z-2]>1: positives.append("1")
	else:
	pass

	print >> outfile, k, i, i+step, len(positives)
	in_matrix.close()

	def parse_paup_log(matrix, paup, ri):
	entries = [ ]
	with open(paup, "U") as myFile:
	for num, line in enumerate(myFile, 1):
	if "Range" in line:
	entries.append(num)
	loi = max(entries)
	homop = [ ]
	with open(paup, "U") as f:
	for line in f.readlines()[loi:]:
	fields=line.split()
	try:
	if float(fields[6])<=float(ri):
	homop.append(fields[0])
	except:
	pass
	in_matrix=open(matrix, "U")
	outfile = open("homoplasies.txt", "w")
	firstLine = in_matrix.readline()
	print >> outfile,"homoplasy"
	for num, line in enumerate(in_matrix, 1):
	fields=line.split()
	if str(num) in homop:
	print >> outfile, fields[0],"\n",
	else:
	print >> outfile, "none", "\n",

	def find_homoplasies(in_matrix, coords_file):
	matrix = open(in_matrix, "U")
	firstLine = matrix.readline()
	first_fields = firstLine.split("\t")
	coords=first_fields.index("Position")
	chrom = first_fields.index("Chromosome")
	outfile = open("SNP_HP_density.txt", "w")
	matrix.close()
	for line in open(coords_file, "U"):
	new_fields = line.split()
	hits = [ ]
	with open(in_matrix) as f:
	next(f)
	for stuff in f:
	fields=stuff.split()
	if "none" not in fields[-1]:
	if fields[chrom]==new_fields[0]:
	try:
	if int(fields[coords])>=int(new_fields[1]) and int(fields[coords])<=int(new_fields[2]):
	hits.append("1")
	except:
	pass

	if int(len(hits))>=1:
	try:
	print >> outfile,"\t".join(new_fields),"\t",len(hits),"\t",float(int(len(hits))/int(new_fields[3])),"\n",
	except:
	print >> outfile,"\t".join(new_fields),"\t",len(hits),"\t","0","\n",
	else:
	print >> outfile,"\t".join(new_fields),"\t","0","\t","0","\n",

	def test_file(option, opt_str, value, parser):
	try:
	with open(value): setattr(parser.values, option.dest, value)
	except IOError:
	print '%s file cannot be opened' % option
	sys.exit()

	def main(matrix, step, paup, ri):
	positions=get_index(matrix)
	print positions
	print "calculating SNP density"
	parse_matrix(matrix, step, positions)
	print "done"
	parse_paup_log(matrix, paup, ri)
	os.system("paste %s homoplasies.txt > new_matrix" % matrix)
	print "calculating homoplasy density"
	find_homoplasies("new_matrix", "out.txt")
	print "done"
	os.system("rm out.txt homoplasies.txt new_matrix")

	if __name__ == "__main__":
	usage="usage: %prog [options]"
	parser = optparse.OptionParser(usage=usage)
	parser.add_option("-m", "--matrix", dest="matrix",
	help="/path/to/NASP formatted SNP matrix [REQUIRED]",
	type="string", action="callback", callback=test_file)
	parser.add_option("-s", "--step", dest="step", action="store",
	help="size for stepping through the genome, defaults to 1000",
	type="int", default="1000")
	parser.add_option("-p", "--paup", dest="paup", action="callback", callback=test_file,
	help="path to paup log file [REQUIRED]", type="string")
	parser.add_option("-r", "--ri", dest="ri", action="store",
	help="RI values below this are determined to be homoplastic, defaults to 0.5",
	type="float", default="0.5")
	options, args = parser.parse_args()

	mandatories = ["matrix","paup"]
	for m in mandatories:
	if not getattr(options, m, None):
	print "\nMust provide %s.\n" %m
	parser.print_help()
	exit(-1)

	main(options.matrix, options.step, options.paup, options.ri)