Skip to content

Instantly share code, notes, and snippets.

@jasonsahl
Last active June 11, 2019 00:24
Show Gist options
  • Save jasonsahl/9306cd014b63cae12154 to your computer and use it in GitHub Desktop.
Save jasonsahl/9306cd014b63cae12154 to your computer and use it in GitHub Desktop.
count and extract parsimony informative SNPs from a multi-fasta
#!/usr/bin/env python
"""retrieve only parsimony infomative
sites from a nucleotide multiple sequence alignment"""
from optparse import OptionParser
import sys
try:
from Bio import SeqIO
except:
print("biopython is not in your path but needs to be!")
sys.exit()
try:
import collections
except:
print("Need Python >2.7")
sys.exit()
import subprocess
"""convert fasta to tab"""
def fasta_to_tab(fasta):
outfile = open("out.tab", "w")
with open(fasta) as my_fasta:
for record in SeqIO.parse(my_fasta, "fasta"):
outfile.write(str(record.id)+"\t"+str(record.seq)+"\n")
outfile.close()
"""invert columns"""
def invert_tabs(in_tab):
outfile = open("tmp.txt", "w")
fields = []
with open(in_tab) as infile:
for line in infile:
my_fields = line.split()
tmp_fields = []
tmp_fields.append(my_fields[0])
for x in my_fields[1]:
tmp_fields.append(x)
fields.append(tmp_fields)
test=map(list, zip(*fields))
names=[]
for x in test[0]:
names.append(x)
for x in test:
outfile.write("\t".join(x)+"\n")
outfile.close()
"""find singletons"""
def get_singles(in_matrix):
file_out=open("tmp.matrix", "w")
with open(in_matrix) as my_file:
firstLine = my_file.readline()
file_out.write(firstLine)
lines = [ ]
for line in my_file:
fields=line.split()
counter=collections.Counter(fields)
values=counter.values()
values.sort(key=int)
for i in range(2,5):
if len(values)==int(i) and values[int(i-2)]!=1:
file_out.write(line)
lines.append("1")
print("number of parsimony-informative SNPs: %s" % len(lines))
file_out.close()
"""convert back to fasta"""
def matrix_to_fasta(in_matrix):
reduced = [ ]
out_fasta = open("singletons.fasta", "w")
with open(in_matrix) as my_file:
for line in my_file:
fields = line.split()
reduced.append(fields)
test=map(list, zip(*reduced))
for x in test:
out_fasta.write(">"+str(x[0])+"\n")
out_fasta.write("".join(x[1:])+"\n")
out_fasta.close()
def main(fasta):
fasta_to_tab(fasta)
invert_tabs("out.tab")
get_singles("tmp.txt")
matrix_to_fasta("tmp.matrix")
subprocess.check_call("rm out.tab tmp.txt tmp.matrix", shell=True)
if __name__ == "__main__":
usage="usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option("-f", "--fasta", dest="fasta",
help="/path/to/fasta_file [REQUIRED]",
action="store", type="string")
options, args = parser.parse_args()
mandatories = ["fasta"]
for m in mandatories:
if not options.__dict__[m]:
print("\nMust provide %s.\n" %m)
parser.print_help()
exit(-1)
main(options.fasta)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment