Last active
June 11, 2019 00:24
-
-
Save jasonsahl/9306cd014b63cae12154 to your computer and use it in GitHub Desktop.
count and extract parsimony informative SNPs from a multi-fasta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""retrieve only parsimony infomative | |
sites from a nucleotide multiple sequence alignment""" | |
from optparse import OptionParser | |
import sys | |
try: | |
from Bio import SeqIO | |
except: | |
print("biopython is not in your path but needs to be!") | |
sys.exit() | |
try: | |
import collections | |
except: | |
print("Need Python >2.7") | |
sys.exit() | |
import subprocess | |
"""convert fasta to tab""" | |
def fasta_to_tab(fasta): | |
outfile = open("out.tab", "w") | |
with open(fasta) as my_fasta: | |
for record in SeqIO.parse(my_fasta, "fasta"): | |
outfile.write(str(record.id)+"\t"+str(record.seq)+"\n") | |
outfile.close() | |
"""invert columns""" | |
def invert_tabs(in_tab): | |
outfile = open("tmp.txt", "w") | |
fields = [] | |
with open(in_tab) as infile: | |
for line in infile: | |
my_fields = line.split() | |
tmp_fields = [] | |
tmp_fields.append(my_fields[0]) | |
for x in my_fields[1]: | |
tmp_fields.append(x) | |
fields.append(tmp_fields) | |
test=map(list, zip(*fields)) | |
names=[] | |
for x in test[0]: | |
names.append(x) | |
for x in test: | |
outfile.write("\t".join(x)+"\n") | |
outfile.close() | |
"""find singletons""" | |
def get_singles(in_matrix): | |
file_out=open("tmp.matrix", "w") | |
with open(in_matrix) as my_file: | |
firstLine = my_file.readline() | |
file_out.write(firstLine) | |
lines = [ ] | |
for line in my_file: | |
fields=line.split() | |
counter=collections.Counter(fields) | |
values=counter.values() | |
values.sort(key=int) | |
for i in range(2,5): | |
if len(values)==int(i) and values[int(i-2)]!=1: | |
file_out.write(line) | |
lines.append("1") | |
print("number of parsimony-informative SNPs: %s" % len(lines)) | |
file_out.close() | |
"""convert back to fasta""" | |
def matrix_to_fasta(in_matrix): | |
reduced = [ ] | |
out_fasta = open("singletons.fasta", "w") | |
with open(in_matrix) as my_file: | |
for line in my_file: | |
fields = line.split() | |
reduced.append(fields) | |
test=map(list, zip(*reduced)) | |
for x in test: | |
out_fasta.write(">"+str(x[0])+"\n") | |
out_fasta.write("".join(x[1:])+"\n") | |
out_fasta.close() | |
def main(fasta): | |
fasta_to_tab(fasta) | |
invert_tabs("out.tab") | |
get_singles("tmp.txt") | |
matrix_to_fasta("tmp.matrix") | |
subprocess.check_call("rm out.tab tmp.txt tmp.matrix", shell=True) | |
if __name__ == "__main__": | |
usage="usage: %prog [options]" | |
parser = OptionParser(usage=usage) | |
parser.add_option("-f", "--fasta", dest="fasta", | |
help="/path/to/fasta_file [REQUIRED]", | |
action="store", type="string") | |
options, args = parser.parse_args() | |
mandatories = ["fasta"] | |
for m in mandatories: | |
if not options.__dict__[m]: | |
print("\nMust provide %s.\n" %m) | |
parser.print_help() | |
exit(-1) | |
main(options.fasta) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment