Skip to content

Instantly share code, notes, and snippets.

@eharkins
Created February 24, 2020 18:48
Show Gist options
  • Save eharkins/749ec474478d108da984a50a9615e6b4 to your computer and use it in GitHub Desktop.
Save eharkins/749ec474478d108da984a50a9615e6b4 to your computer and use it in GitHub Desktop.
import re
import sys
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from Bio.Seq import Seq
"""
to calculate avg rate of inferred ambiguous nucleotides for many "ancestralStates" raxml-ng output files, do:
1. fd ancestralStates -x python amb_rate.py > amb_rates.txt
(another way would be to write a for loop - in python or shell script - to run amb_rate.py for each ASR output file)
2. use the below function to get the avg rate from amb_rates.txt:
def get_avg(file):
with open(file) as fh:
lines = list(fh)
rates = [float(l) for l in lines]
return sum(rates)/len(rates)
"""
def parse_raxmlng_ancestral_state(line):
asr_seqid, asr_seq = line.strip().split()
return SeqRecord(
Seq(asr_seq),
id=asr_seqid,
name="",
description="",
)
if __name__ == "__main__":
raxml = True
data = ""
if raxml:
with open(sys.argv[1]) as fh:
records = [parse_raxmlng_ancestral_state(l) for l in fh]
else:
records = SeqIO.parse(sys.argv[1], "fasta")
for record in records:
data += str(record.seq)
amb_count = len(re.findall(r"[^ACGTN\-]", data))
print(float(amb_count)/len(data))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment