Skip to content

Instantly share code, notes, and snippets.

@cjsmeele
Last active September 7, 2020 21:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cjsmeele/4e8107e870519b2298821a3639d49d7d to your computer and use it in GitHub Desktop.
Save cjsmeele/4e8107e870519b2298821a3639d49d7d to your computer and use it in GitHub Desktop.
Parser combinators in Python3 (bad)
# Parser combinators in Python3 - Chris Smeele, 2020.
#
# This code works. Just not very good, due to it mapping very badly to
# Python's execution model (it blows up with recursion depth).
# It's also incredibly ugly, since Python very much does not want you to write
# code in this style.
# Creating it was a nice exercise however.
#
# I hereby release this awful code into the public domain.
# Please leave it there and don't try to actually use it :-)
#
# As a test, this implements a sort-of XML parser (tag soup) in two steps:
# (1) text -> tokens (2) tokens -> elements
#
# Types for the stuff we'll be parsing {{{
class Element():
def __init__(self, name, body):
if type(body) is not list:
body = decode_entities(body)
self.name = name
self.body = body
self.text = body
def find(self, name):
for x in self.findall(name):
return x
def findall(self, name):
return list(self._findall(name))
def _findall(self, name):
for el in self.body:
if el.name == name:
yield el
def __str__(self):
if type(self.body) is list:
return '<{}>{}</{}>'.format(self.name, ''.join(map(str, self.body)), self.name)
else:
return '<{}>{}</{}>'.format(self.name, encode_entities(self.body), self.name)
def __repr__(self):
return '{}({})'.format(self.name, repr(self.body))
class Token():
def __init__(self, s):
self.text = s
def __repr__(self):
return str(type(self).__name__) + '(' + self.text.decode('utf-8') + ')'
def __str__(self):
return repr(self)
class TokenElOpen(Token): pass
class TokenElClose(Token): pass
class TokenCData(Token): pass
# }}}
# A parser is a function s -> (a, s)
# Execute a parser.
def parse(p, s):
return p(s)
# Execute a parser, return None unless all input was succesfully consumed.
def parse_(p, s):
val, rest = parse(p, s)
if len(rest) != 0:
return None
else:
return val
# >>=, for the parser monad.
def pbind(p, f):
def g(s):
x, rest = parse(p, s)
if x is None:
return None, s
y, rest = f(x)(rest)
if y is None:
return None, s
return y, rest
return g
ppure = lambda x: lambda s: (x, s)
def pmap(g, p):
return pbind(p, lambda x: ppure(g(x)))
# Why does python's builtin `reduce` not specify if it folds left or right? grrr.
def foldl(f, init, xs):
if xs == []:
return init
return foldl(f, f(init, xs[0]), xs[1:])
# Try to create new syntax within Python. That will end well.
pdo = lambda p, *fs: foldl(pbind, p, list(fs))
# Some sexy mutual recursion.
pmany = lambda p: palt(psome(p), ppure([]))
psome = lambda p: pbind(p, lambda x: pmap(lambda xs: [x] + xs, pmany(p)))
# <|>
def palt(*ps):
def f(s):
for p in ps:
val, rest = p(s)
if val is not None:
return val, rest
return None, s
return f
# this might be pushing it a bit too far...
# pitem = lambda: lambda s: ((None, s) if len(s) == 0 else ((bytes([s[0]]) if type(s) is bytes else s[0]), s[1:]))
def pitem():
def f(s):
if len(s) == 0:
return None, s
elif type(s) is bytes:
return bytes([s[0]]), s[1:]
else:
return s[0], s[1:]
return f
# Here be dragons.
psat = lambda f: pbind(pitem(), lambda x: ppure(x) if f(x) else ppure(None))
pchar = lambda expect: psat(lambda x: x == expect)
pspace = lambda: psat(lambda x: x in b' \n\t\r')
pspace_before = lambda p: pbind(pmany(pspace()), lambda _: p)
pspace_after = lambda p: pbind(p, lambda x: pbind(pmany(pspace()), lambda _: ppure(x)))
pspace_around = lambda p: pspace_after(pspace_before(p))
# Token parser {{{
# element open. (<...>)
pTopen = lambda: pmap(TokenElOpen,
pspace_before(pdo( pchar(b'<'),
lambda _: psome(psat(lambda x: x not in b'/>')),
lambda tag: pbind(pchar(b'>'),\
lambda _: ppure(b''.join(tag))))))
# element close. (</...>)
pTclose = lambda: pmap(TokenElClose,
pspace_around(pdo( pchar(b'<'),
lambda _: pchar(b'/'),
lambda _: psome(psat(lambda x: x != b'>')),
lambda tag: pbind(pchar(b'>'),
lambda _: ppure(b''.join(tag))))))
# textual element body (...)
pTcdata = lambda: pmap(TokenCData,
pbind(psome(psat(lambda x: x != b'<')),
lambda chars: ppure(b''.join(chars))))
pT = lambda: palt(pTopen(), pTclose(), pTcdata())
# }}}
# Element parser (token -> Element) {{{
popen = lambda: pmap(lambda x: x.text.decode('utf-8'), psat(lambda x: type(x) == TokenElOpen))
pclose = lambda s: psat(lambda x: type(x) == TokenElClose and x.text.decode('utf-8') == s)
pcdata = lambda: pmap(lambda x: x.text, psat(lambda x: type(x) == TokenCData))
pelem_body = lambda: palt(pmap(lambda x: x.decode('utf-8'), pcdata()), pmany(pelem()))
pelem = lambda: pbind(popen(),
lambda name: pbind(pelem_body(),
lambda body: pbind(pclose(name),
lambda _: ppure(Element(name, body)))))
# }}}
pxml = pelem
entities = [('&', '&amp;'),
('<', '&lt;'),
('>', '&gt;')]
def encode_entities(s):
for k, v in entities:
s = s.replace(k, v)
return s
def decode_entities(s):
rev = [x for x in entities]
rev.reverse()
for k, v in rev:
s = s.replace(v, k)
return s
def fromstring(s):
if type(s) is str:
s = s.encode('utf-8')
# What is tail recursion?
import sys
r = sys.getrecursionlimit()
# sys.setrecursionlimit(4000)
tokens = parse_(pmany(pT()), s)
#print(tokens)
if tokens is None:
return None
tags = parse_(pxml(), tokens)
sys.setrecursionlimit(r)
# print(str(tags))
return tags
# Converts to an element tree, and back to a string for printing.
#
print(fromstring("<a>\n\
<b>bloep</b>\n\
<b>bloep</b>\n\
<b>bloep</b>\n <b>bl&amp;ep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b> \
<c>blarp</c> </a>"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment