Created
October 1, 2012 18:46
-
-
Save mavc/3813659 to your computer and use it in GitHub Desktop.
pyparsing and ply demonstration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from ply import lex, yacc | |
from ply.lex import TOKEN | |
import pyparsing as pp | |
# First, let's define a pyparsing parser for JSON.
class JSONPyParsing(object):
    """JSON parser assembled from pyparsing expressions.

    The grammar is built once, as class attributes, when the class body
    runs.  ``loads`` is the entry point and returns plain Python values
    (``None``, ``bool``, ``int``, ``float``, string, ``list``, ``dict``).
    """
    # pylint: disable-msg=W0104,E0213

    # JSON Elements ("expressions") are either null, booleans,
    # strings, numbers, objects, or arrays.
    null = pp.Keyword('null').setParseAction(pp.replaceWith(None))
    boolean = (
        pp.Keyword('true').setParseAction(pp.replaceWith(True)) |
        pp.Keyword('false').setParseAction(pp.replaceWith(False))
    )

    # A real JSON string has unicode escape sequences and escaped
    # control sequences, but we'll ignore them here.
    string = pp.QuotedString('"', escChar='\\')

    # A number is defined as [+-]?(0|[1-9][0-9]*)([.][0-9]*)?([eE][+-]?[0-9]+)?
    # The named group 'float' captures the fraction/exponent part; it is
    # the empty string for integers, which number_action relies on.
    number = pp.Regex(
        r"""[+-]?
        (0 | [1-9][0-9]*)
        (?P<float>
        ([.][0-9]*)?
        ([eE][+-]?[0-9]+)?
        )""",
        flags=re.X
    )

    # A more verbose way to define numbers:
    # plusminus = pp.oneOf('+ -')
    # number = pp.Combine(
    #     pp.Optional(plusminus) +
    #     ('0' | pp.Word('123456789', pp.nums)) +
    #     (pp.Optional(pp.Word('.', pp.nums)) +
    #      pp.Optional(pp.CaselessLiteral('e') +
    #                  pp.Optional(plusminus) +
    #                  pp.Word(pp.nums)))('float')
    # )

    # Define how to process these elements.
    def number_action(tok):
        # Convert the matched text to float when the 'float' group is
        # non-empty (a fraction or exponent was present), else to int.
        # pylint: disable-msg=E1101
        if tok.float:
            return [float(tok[0])]
        return [int(tok[0])]

    number.setParseAction(number_action)

    # Objects and Arrays are defined in terms of expr, and expr is
    # defined in terms of objects and arrays. When elements are
    # defined recursively, use Forward() as a placeholder and << to
    # define the value later.
    expr = pp.Forward()

    # Arrays are defined as '[' (expr {',' expr}) ']'
    arr = (
        pp.Suppress('[') +
        pp.Optional(pp.delimitedList(expr)('elements')) +
        pp.Suppress(']')
    )

    # Objects are defined as '{' (string ':' expr {',' string: expr}) '}'
    keyval = string('key') + pp.Suppress(':') + expr('value')
    obj = (
        pp.Suppress('{') +
        pp.Optional(pp.delimitedList(pp.Group(keyval))('elements')) +
        pp.Suppress('}')
    )

    # Set actions to convert them into the equivalent Python types.
    def arr_action(tok):
        # Wrap the list in another list so it is emitted as ONE token.
        # pylint: disable-msg=E1101
        return [list(tok.elements)]

    def obj_action(tok):
        # Each element is a Group holding the 'key' and 'value' names.
        # pylint: disable-msg=E1101
        return [{t.key: t.value for t in tok.elements}]

    obj.setParseAction(obj_action)
    arr.setParseAction(arr_action)

    # Now that we have defined obj and arr in terms of expr, we can
    # "fill in" the definition of expr.
    expr << (null | boolean | string | number | obj | arr)

    # A valid JSON Document has either an object or array at the top level
    json = obj | arr

    @staticmethod
    def loads(json_string):
        """Parse *json_string* and return the top-level list or dict.

        The whole input must match (``parseAll=True``); otherwise the
        exception raised by pyparsing propagates to the caller.
        """
        return JSONPyParsing.json.parseString(json_string, parseAll=True)[0]
# Now, let's define a PLY parser for JSON.
# PLY requires us to define a 'lexer', which transforms an input
# string into tokens, and a 'parser', which transforms streams of
# tokens into an AST.
class JSONPlyLexer(object):
    """PLY lexer that turns JSON text into a stream of tokens.

    Two exclusive lexer states are used in addition to the default one:
    'str' is entered on an opening double quote and emits one CHAR token
    per character, and 'esc' is entered after a backslash inside a string
    and handles exactly one escape sequence before returning to 'str'.

    NOTE: in the ``t_*`` rule methods the docstring IS the token regex,
    so explanatory text for them lives in ``#`` comments.
    """

    states = (
        ('str', 'exclusive'),
        ('esc', 'exclusive'),
    )

    tokens = (
        'INT',
        'FLOAT',
        'NULL',
        'TRUE',
        'FALSE',
        'LBRACE',
        'RBRACE',
        'LBRACKET',
        'RBRACKET',
        'COMMA',
        'COLON',
        'DBLQUOTE',
        'CHAR',
    )

    # Text type of the running interpreter: ``unicode`` on Python 2,
    # ``str`` on Python 3 (where the name ``unicode`` does not exist).
    _TEXT = type(u'')

    # Single-character escapes (the character after the backslash) and
    # the character each one stands for.  '"' is handled by its own rule.
    _ESCAPES = {
        'b': u'\b',
        'n': u'\n',
        'r': u'\r',
        't': u'\t',
        'f': u'\f',
        '/': u'/',
        '\\': u'\\',
    }

    def __init__(self):
        """Build the underlying PLY lexer from the rules on this object."""
        self.lexer = lex.lex(module=self)

    def __iter__(self):
        """Iterate over the tokens of the current input."""
        return iter(self.lexer)

    def token(self):
        """Return the next token from the underlying PLY lexer."""
        return self.lexer.token()

    # Lexing rules as variables are checked in order of longest length.
    t_LBRACE = r'{'
    t_RBRACE = r'}'
    t_LBRACKET = r'\['
    t_RBRACKET = r'\]'
    t_COMMA = r','
    t_COLON = r':'
    t_TRUE = r'true'
    t_FALSE = r'false'
    t_NULL = r'null'

    # Lexing rules in methods are specified using regex docstrings,
    # and checked in the order that they are defined.
    def t_FLOAT(self, t):
        r'[+-]?(0|[1-9][0-9]*)([.][0-9]*)?([eE][+-]?[0-9]+)?'
        # One rule matches every number (optional sign included, the same
        # pattern the pyparsing grammar in this file uses).  A fraction or
        # an exponent -- upper- or lower-case 'e' -- makes it a FLOAT;
        # anything else is re-tagged as INT.
        if any(marker in t.value for marker in '.eE'):
            t.value = float(t.value)
        else:
            t.type = 'INT'
            t.value = int(t.value)
        return t

    def t_DBLQUOTE(self, t):
        r'"'
        # Opening quote: switch to per-character string lexing.
        self.lexer.push_state('str')
        return t

    def t_esc_DBLQUOTE(self, t):
        r'"'
        # An escaped quote (\") is an ordinary character of the string.
        self.lexer.pop_state()
        t.value = u'"'
        t.type = 'CHAR'
        return t

    def t_esc_CHAR(self, t):
        r'[bnrtf/\\]|u[0-9a-fA-F]{4}'
        # Decode one escape sequence and return to the 'str' state.
        self.lexer.pop_state()
        if t.value in self._ESCAPES:
            t.value = self._ESCAPES[t.value]
        else:
            # uXXXX: let the codec decode the four hex digits.  A
            # surrogate pair written as two \u escapes yields two
            # separate CHAR tokens; they are not combined here.
            escaped = ('\\' + t.value).encode('ascii')
            t.value = escaped.decode('unicode_escape')
        return t

    def t_esc_error(self, t):
        # Any other character after a backslash is not a valid escape.
        raise SyntaxError(t)

    def t_str_CHAR(self, t):
        r'.'
        if t.value == '\\':
            # Start of an escape sequence; emit nothing for the backslash.
            self.lexer.push_state('esc')
            return None
        if t.value == '"':
            # Closing quote: leave the string state.
            self.lexer.pop_state()
            t.type = "DBLQUOTE"
            return t
        if t.value in '\b\r\t\f\n':
            raise ValueError(t.value + ' not a valid string character')
        t.value = self._TEXT(t.value)
        return t

    def t_str_error(self, t):
        raise SyntaxError(t)

    def t_ignore_whitespace(self, t):
        r'\s+'
        # Returning None discards whitespace between tokens.
        pass

    def t_error(self, t):
        raise SyntaxError(t)

    def input(self, data):
        """Feed *data* to the lexer, starting from the default state.

        The state is reset explicitly because an earlier input that
        failed inside a string would otherwise leave the reused lexer
        in the 'str' or 'esc' state.
        """
        self.lexer.begin('INITIAL')
        self.lexer.input(data)
class JSONPlyParser(object):
    """PLY (yacc) parser that builds Python values from JSON tokens.

    NOTE: in the ``p_*`` rule methods the docstring IS the grammar
    production, so explanatory text for them lives in ``#`` comments.
    """

    # The Lexer and Parser must share the same set of tokens.
    tokens = JSONPlyLexer.tokens

    def __init__(self):
        """Create the lexer and build the parser tables from the rules."""
        self.lexer = JSONPlyLexer()
        self.parser = yacc.yacc(module=self)

    # The first rule specified is the top level rule.
    # Rules are defined in each docstring in BNF.
    # Rules should be left-factored: No two rules
    # should share a prefix longer than 1.
    def p_json(self, p):
        '''
        json : obj
             | arr
        '''
        p[0] = p[1]

    def p_expr(self, p):
        '''
        expr : NULL
             | TRUE
             | FALSE
             | INT
             | FLOAT
             | string
             | obj
             | arr
        '''
        p[0] = p[1]

    def p_string(self, p):
        '''
        string : DBLQUOTE DBLQUOTE
               | DBLQUOTE chars DBLQUOTE
        '''
        # len(p) counts the left-hand side too: 3 means the empty string,
        # 4 means 'chars' is present as a list of character values.
        if len(p) == 3:
            p[0] = u''
        else:
            p[0] = u''.join(p[2])

    def p_chars(self, p):
        '''
        chars : CHAR
              | chars CHAR
        '''
        # Accumulate characters in a list (joined once by p_string)
        # instead of re-copying a growing string on every reduction.
        if len(p) == 2:
            p[0] = [p[1]]
        else:
            p[0] = p[1]
            p[0].append(p[2])

    def p_obj(self, p):
        '''
        obj : LBRACE RBRACE
            | LBRACE members RBRACE
        '''
        # 'members' is a list of (key, value) pairs; on duplicate keys
        # the last pair wins.
        p[0] = dict(p[2]) if len(p) > 3 else {}

    def p_arr(self, p):
        '''
        arr : LBRACKET RBRACKET
            | LBRACKET elements RBRACKET
        '''
        p[0] = list(p[2]) if len(p) > 3 else []

    def p_members(self, p):
        '''
        members : string COLON expr
                | members COMMA string COLON expr
        '''
        if len(p) == 4:
            p[0] = [(p[1], p[3])]
        else:
            p[0] = p[1]
            p[0].append((p[3], p[5]))

    def p_elements(self, p):
        '''
        elements : expr
                 | elements COMMA expr
        '''
        if len(p) == 2:
            p[0] = [p[1]]
        else:
            p[0] = p[1]
            p[0].append(p[3])

    def p_error(self, p):
        # Called by the parser on a syntax error with the offending token.
        raise SyntaxError(p)

    def parse(self, text):
        """Parse *text* with this instance's lexer and return the result."""
        return self.parser.parse(text, self.lexer)

    # Lazily created parser shared by every call to loads().
    instance = None

    @staticmethod
    def loads(json_str):
        """Parse *json_str* using the shared parser instance.

        The instance is created on first use and then reused.
        """
        if JSONPlyParser.instance is None:
            JSONPlyParser.instance = JSONPlyParser()
        return JSONPlyParser.instance.parse(json_str)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment