eharkins/amb_rates.py

## amb_rates.py
import re
import sys
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from Bio.Seq import Seq

"""
to calculate avg rate of inferred ambiguous nucleotides for many "ancestralStates" raxml-ng output files, do:

1. fd ancestralStates -x python amb_rate.py > amb_rates.txt
   (another way would be to write a for loop - in python or shell script - to run amb_rate.py for each ASR output file)

2. use the below function to get the avg rate from amb_rates.txt:

def get_avg(file):
    with open(file) as fh:
        lines = list(fh)
        rates = [float(l) for l in lines]
        return sum(rates)/len(rates)
"""

def parse_raxmlng_ancestral_state(line):
    asr_seqid, asr_seq = line.strip().split()
    return SeqRecord(
        Seq(asr_seq),
        id=asr_seqid,
        name="",
        description="",
    )

if __name__ == "__main__":
    raxml = True
    data = ""
    if raxml:
        with open(sys.argv[1]) as fh:
            records = [parse_raxmlng_ancestral_state(l) for l in fh]
    else:
        records = SeqIO.parse(sys.argv[1], "fasta")
    for record in records:
        data += str(record.seq)
    amb_count = len(re.findall(r"[^ACGTN\-]", data))
    print(float(amb_count)/len(data))
	import re
	import sys
	from Bio.SeqRecord import SeqRecord
	from Bio import SeqIO
	from Bio.Seq import Seq

	"""
	to calculate avg rate of inferred ambiguous nucleotides for many "ancestralStates" raxml-ng output files, do:

	1. fd ancestralStates -x python amb_rate.py > amb_rates.txt
	(another way would be to write a for loop - in python or shell script - to run amb_rate.py for each ASR output file)

	2. use the below function to get the avg rate from amb_rates.txt:

	def get_avg(file):
	with open(file) as fh:
	lines = list(fh)
	rates = [float(l) for l in lines]
	return sum(rates)/len(rates)
	"""

	def parse_raxmlng_ancestral_state(line):
	asr_seqid, asr_seq = line.strip().split()
	return SeqRecord(
	Seq(asr_seq),
	id=asr_seqid,
	name="",
	description="",
	)

	if __name__ == "__main__":
	raxml = True
	data = ""
	if raxml:
	with open(sys.argv[1]) as fh:
	records = [parse_raxmlng_ancestral_state(l) for l in fh]
	else:
	records = SeqIO.parse(sys.argv[1], "fasta")
	for record in records:
	data += str(record.seq)
	amb_count = len(re.findall(r"[^ACGTN\-]", data))
	print(float(amb_count)/len(data))