Skip to content

Instantly share code, notes, and snippets.

@bredelings
Created August 13, 2020 16:15
Show Gist options
  • Save bredelings/4888f41efc169805a92d42f70767600b to your computer and use it in GitHub Desktop.
Save bredelings/4888f41efc169805a92d42f70767600b to your computer and use it in GitHub Desktop.
How to parse Newick in python using the parsy library
#!/usr/bin/python3
import re
import sys

from parsy import ParseError, generate, regex, string, success
def newick_quote(s):
    """Return `s` written as a Newick label.

    Three cases, checked in this order:

    * If `s` contains a Newick structural character (one of ``[]();:,'_``)
      or any whitespace other than a plain space (tab, newline, ...), it
      is wrapped in single quotes and every embedded ``'`` is doubled.
    * Otherwise, if `s` contains spaces, each space is written as ``_``.
    * Otherwise `s` is returned unchanged.

    Whitespace other than a plain space has no unquoted spelling (only
    the space has the ``_`` shorthand), so such names must be quoted to
    survive a round trip.
    """
    must_quote = any(
        ch in "[]();:,'_" or (ch.isspace() and ch != " ") for ch in s
    )
    if must_quote:
        return "'" + s.replace("'", "''") + "'"
    if " " in s:
        return s.replace(" ", "_")
    return s
class Tree:
    """One node of a tree, together with everything below it.

    The constructor arguments are stored unchanged as attributes:
    `name` (node label or None), `length` (length written after ":" or
    None) and `children` (a sequence of Tree objects; None or empty for
    a node that is written without a parenthesised child list).
    """

    def __init__(self, name=None, length=None, children=None):
        self.name = name
        self.length = length
        self.children = children

    def _show(self):
        """Return the Newick text of this subtree without the final ';'."""
        pieces = []
        if self.children:
            inner = ",".join(kid._show() for kid in self.children)
            pieces.append("(" + inner + ")")
        if self.name is not None:
            pieces.append(newick_quote(self.name))
        if self.length is not None:
            pieces.append(f":{self.length}")
        return "".join(pieces)

    def show(self):
        """Return the complete Newick string, terminated by ';'."""
        return self._show() + ";"

    def __str__(self):
        return self.show()
# Floating-point literals
# `decimals` matches a run of one or more ASCII digits; `floating` below
# uses it for the integer part, the fraction and the exponent.
decimals = regex(r'[0-9]+')
# This seems like it should be part of the library
@generate
def floating():
    """Parse a decimal number and return it as a float.

    Accepted shape: an optional "+"/"-" sign, digits, an optional
    "." followed by digits, and an optional exponent ("e" or "E", an
    optional sign, digits).  The matched pieces are glued back together
    and handed to float().
    """
    sign_char = string("+") | string("-")
    exp_part = (string("e") | string("E")) + option(sign_char, "") + decimals

    lead_sign = yield option(sign_char, "")
    whole = yield decimals
    fraction = yield option(string(".") + decimals, "")
    exp_text = yield option(exp_part, "")

    return float("".join((lead_sign, whole, fraction, exp_text)))
def concat_str(strings):
    """Concatenate an iterable of strings into one string, no separator."""
    return str.join("", strings)
# Try `parser`; if it does not match, succeed anyway and yield `value`.
def option(parser, value=None):
    """Return a parser that runs `parser` and falls back to `value`."""
    fallback = success(value)
    return parser | fallback
# Lexer part 1: space parser
# A comment is "[", then any run of characters other than "]", then "]".
# NOTE: the method call binds tighter than ">>", so the "comment"
# description is attached to the closing-bracket parser only; the
# parentheses below just make that existing grouping visible.
comment = string("[") >> regex(r"[^\]]*") >> (string("]").desc("comment"))
whitespace = regex(r"[ \t\n]+").desc("whitespace")
# Any number of comments and whitespace runs, with "" as the fallback value.
spaces = option((comment | whitespace).many(), "")
# Lexer part 2: in quoted strings, "''" changes to "'"
# Inside quotes a doubled quote stands for one literal quote; any other
# character except "'" stands for itself.
quoted_char = (string("''") >> success("'")) | regex(r"[^']")
# Opening quote, the decoded characters joined into one string, closing quote.
quoted_label = (
    string("'")
    >> quoted_char.many().map(concat_str)
    << string("'")
).desc('quoted_label')
# Lexer part 3: in UNquoted string, "_" changes to " "
# An unquoted label is a (possibly empty) run of characters, where "_"
# decodes to a space.  The excluded set is the Newick punctuation plus
# every character the `whitespace` parser accepts (space, tab, newline):
# previously only the space was excluded, so a tab or newline directly
# after a label was absorbed into the name instead of being skipped.
unquoted_label = (
    (string("_") >> success(" "))
    | regex(r"[^ \t\n()\[\]':;,]")
).many().map(concat_str).desc('unquoted_label')
# Parser
# A label is tried as a quoted label first, then as an unquoted one.
label = quoted_label | unquoted_label
# ":" then an optional number, with comments/whitespace allowed around it.
# The result is the float, or None when no number follows the colon.
branch_length = string(":") >> spaces >> (floating | success(None)) << spaces
@generate
def subtree():
    """Parse one node: optional child list, optional label, optional length.

    Returns a Tree built from the label, the branch length and the list
    of children (an empty list when no parenthesised child list is
    present).
    """
    children = yield option(descendant_list)
    if children is None:
        # Build a fresh list per node.  Passing [] as the fallback value
        # would hand the very same list object to every childless node,
        # so mutating one leaf's children would change all of them.
        children = []
    yield spaces
    node_label = yield option(label)
    yield spaces
    length = yield option(branch_length)
    return Tree(node_label, length, children)
@generate
def descendant_list():
    """Parse "(" subtree ("," subtree)* ")" and return the list of subtrees.

    Leading comments/whitespace before the opening parenthesis are skipped.
    """
    yield spaces >> string("(")
    kids = yield subtree.sep_by(string(","))
    yield string(")")
    return kids
@generate
def tree():
    """Parse a whole Newick tree: a subtree followed by ";".

    Comments and whitespace are allowed before the subtree, before the
    semicolon and after it.  Returns the root Tree.
    """
    root = yield spaces >> subtree
    yield spaces >> string(";") >> spaces
    return root
def check(i, o):
    """Parse `i`, print it back, and report if the text differs from `o`.

    Prints a "Checking" line for every call and a FAIL line with the
    actual output when the round-tripped text is not equal to `o`.
    A parse failure is not caught here and propagates to the caller.
    """
    print(f'Checking "{i}" -> "{o}"')
    # Parse once and reuse the result for both the comparison and the report.
    got = str(tree.parse(i))
    if got != o:
        print(" FAIL: got " + got)
def check_error(i):
    """Verify that `i` is rejected by the `tree` parser.

    Prints "OK" when parsing raises ParseError and a warning when the
    input parses successfully.  Only ParseError counts as a rejection:
    any other exception (a bug in the parser, Ctrl-C, ...) propagates
    instead of being reported as a pass.
    """
    print(f'Checking "{i}" does not parse: ',end="")
    try:
        tree.parse(i)
    except ParseError:
        print("OK")
    else:
        print("Parses, but should not!")
# With a command-line argument: parse it and print the tree back.
# Without one: run the built-in round-trip checks.
if sys.argv[1:]:
    print(tree.parse(sys.argv[1]))
else:
    # (input text, expected text after parsing and printing back)
    for given, expected in [
        ("('a b',(b,c));", "(a_b,(b,c));"),
        ("(a_b,(b,c));", "(a_b,(b,c));"),
        ("('a_b',(b,c));", "('a_b',(b,c));"),
        ("('a''b',(b,c));", "('a''b',(b,c));"),
        ("('a ,b',(b,c));", "('a ,b',(b,c));"),
        ("('a ,b'[yo dude],(b,c));", "('a ,b',(b,c));"),
        ("('a ,b'[yo dude]:[hi!]0.1,(b,c));", "('a ,b':0.1,(b,c));"),
        ("(a?b : 0.1 , (b , c ));", "(a?b:0.1,(b,c));"),
        (";", ";"),
        ("();", "();"),
        ("(,);", "(,);"),
        ("((,),);", "((,),);"),
        ("((a,a)a,a)a;", "((a,a)a,a)a;"),
        ("a;", "a;"),
        ("a:0.1;", "a:0.1;"),
        ("a:1.0;", "a:1.0;"),
        ("a:1;", "a:1.0;"),
        ("a[node]:0.1[branch];", "a:0.1;"),
    ]:
        check(given, expected)
    # An unquoted label may not contain a quote character.
    check_error("(a''b,(b,c));")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment