Created
February 24, 2020 18:48
-
-
Save eharkins/749ec474478d108da984a50a9615e6b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
from Bio.SeqRecord import SeqRecord | |
from Bio import SeqIO | |
from Bio.Seq import Seq | |
""" | |
to calculate avg rate of inferred ambiguous nucleotides for many "ancestralStates" raxml-ng output files, do: | |
1. fd ancestralStates -x python amb_rate.py > amb_rates.txt | |
(another way would be to write a for loop - in python or shell script - to run amb_rate.py for each ASR output file) | |
2. use the below function to get the avg rate from amb_rates.txt: | |
def get_avg(file): | |
with open(file) as fh: | |
lines = list(fh) | |
rates = [float(l) for l in lines] | |
return sum(rates)/len(rates) | |
""" | |
def parse_raxmlng_ancestral_state(line): | |
asr_seqid, asr_seq = line.strip().split() | |
return SeqRecord( | |
Seq(asr_seq), | |
id=asr_seqid, | |
name="", | |
description="", | |
) | |
if __name__ == "__main__": | |
raxml = True | |
data = "" | |
if raxml: | |
with open(sys.argv[1]) as fh: | |
records = [parse_raxmlng_ancestral_state(l) for l in fh] | |
else: | |
records = SeqIO.parse(sys.argv[1], "fasta") | |
for record in records: | |
data += str(record.seq) | |
amb_count = len(re.findall(r"[^ACGTN\-]", data)) | |
print(float(amb_count)/len(data)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment