Created
August 30, 2018 21:58
-
-
Save goodmami/686385b4b39a3bac00fbbe78a5cda6c8 to your computer and use it in GitHub Desktop.
Comparing Lark and Parsimonious on JSON parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# usage: python3 lark-parsimonious.py [TESTNUM]
#
# Where TESTNUM is one of:
#
#   1. Parsimonious with the faster grammar (tree-only)
#   2. Parsimonious with the faster grammar (transform data)
#   3. Parsimonious with the slower grammar (tree-only)
#   4. Parsimonious with the slower grammar (transform data)
#   5. Lark with LALR (tree-only)
#   6. Lark with LALR (tree-less transformation)
#   7. Lark with LALR (tree and transformation)
#   8. json module from the Python standard library
#
# If TESTNUM is not given, all tests are run.
#
# Also, it expects a file "generated.json" to be in the current directory.
# Such a file can be created here:
#   https://www.json-generator.com/
#
# Requirements:
#   * lark-parser
#   * parsimonious
#
# Author: Michael Wayne Goodman
# Note that the original (slower) Parsimonious grammar is from:
#   https://gist.github.com/reclosedev/5222560
# And the Lark grammar is from:
#   https://github.com/lark-parser/lark/blob/master/docs/json_tutorial.md
# See these URLs for any license restrictions of the respective sources.
import argparse
import ast
import json
import timeit

from lark import Lark, Transformer, v_args
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
# Command-line interface: an optional positional TESTNUM selects a single
# benchmark; 0 (the default when omitted) runs all of them.  Restricting the
# value with `choices` makes an out-of-range number an explicit usage error
# instead of a silent run that executes nothing.
argparser = argparse.ArgumentParser(
    description='Compare Lark and Parsimonious on JSON parsing.')
argparser.add_argument(
    'testnum', type=int, nargs='?', default=0, choices=range(9),
    help='benchmark to run (1-8); 0 or omitted runs every benchmark')
args = argparser.parse_args()
# The "faster" Parsimonious JSON grammar: optional whitespace is folded into
# the punctuation regexes rather than matched by separate whitespace rules.
# NOTE(review): in `Value`, `Float` is tried before `Integer`, and the Float
# regex also accepts plain integers, so the `Integer` alternative is never
# selected and whole numbers are always handled by the Float rule.
ParsimoniousJson1 = Grammar(r'''
    Start        = ~"\s*" ( Object / Array ) ~"\s*"
    Object       = ~"{\s*" Members? ~"\s*}"
    Members      = MappingComma* Mapping
    MappingComma = Mapping ~"\s*,\s*"
    Mapping      = DQString ~"\s*:\s*" Value
    Array        = ~"\[\s*" Items? ~"\s*\]"
    Items        = ValueComma* Value
    ValueComma   = Value ~"\s*,\s*"
    Value        = Object / Array / DQString
                 / TrueVal / FalseVal / NullVal / Float / Integer
    TrueVal      = "true"
    FalseVal     = "false"
    NullVal      = "null"
    DQString     = ~"\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""
    Float        = ~"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"
    Integer      = ~"[-+]?\d+"
''')
class ParsimoniousJson1Visitor(NodeVisitor):
    """Build Python objects from a ``ParsimoniousJson1`` parse tree."""

    def generic_visit(self, node, visited_children):
        """Fallback for every rule without a dedicated ``visit_*`` method.

        Returns the list of visited children when it is non-empty, otherwise
        the node itself.  The other visitors rely on this: "is it a list?"
        tells a matched optional/repetition apart from an empty one.
        """
        return visited_children or node

    # helper functions for generic patterns

    def combine_many_or_one(self, node, children):
        """Flatten the pattern ``values = value_and_comma* value`` to a list.

        ``children`` is ``[leading, last]`` where ``leading`` is a list of the
        comma-terminated items, or a bare node when there were none.
        """
        members, member = children
        if isinstance(members, list):
            return members + [member]
        return [member]

    def lift_first_child(self, node, visited_children):
        """Return the first visited child, e.g. for::

            rule = item optional another_optional?

        returns ``item``.
        """
        return visited_children[0]

    # visitors

    visit_Value = visit_MappingComma = visit_ValueComma = lift_first_child
    # ``Items`` has the same shape as ``Members`` and needs the same
    # flattening; without it ``generic_visit`` would hand ``visit_Array`` a
    # nested ``[leading, last]`` pair instead of a flat list of values.
    visit_Members = visit_Items = combine_many_or_one

    def visit_Start(self, node, children):
        """Return the top-level object or array of the document."""
        # children[1] is the anonymous ``( Object / Array )`` choice, which
        # ``generic_visit`` turns into a one-element list; unwrap it so the
        # caller gets the document itself rather than ``[document]``.
        return children[1][0]

    def visit_Object(self, node, children):
        """Return a dict built from the object's ``(key, value)`` pairs."""
        _, members, _ = children
        # ``Members?`` is a one-element list when present, a bare node if not
        if isinstance(members, list):
            members = members[0]
        else:
            members = []
        return dict(members)

    def visit_Array(self, node, children):
        """Return the array's values as a list."""
        _, values, _ = children
        # ``Items?`` is a one-element list when present, a bare node if not
        if isinstance(values, list):
            values = values[0]
        else:
            values = []
        return values

    def visit_Mapping(self, node, children):
        """Return one ``(key, value)`` pair; the separator is dropped."""
        key, _, value = children
        return key, value

    def visit_DQString(self, node, visited_children):
        """Decode the quoted string by evaluating it as a Python literal."""
        # produce unicode for strings
        return ast.literal_eval("u" + node.text)

    def visit_Float(self, node, visited_children):
        return float(node.text)

    def visit_Integer(self, node, visited_children):
        return int(node.text)

    def visit_TrueVal(self, node, visited_children):
        return True

    def visit_FalseVal(self, node, visited_children):
        return False

    def visit_NullVal(self, node, visited_children):
        return None
# taken from https://gist.github.com/reclosedev/5222560
# Fixed number to allow 1e2 floats
# Changed NodeVisitor to work with Python3
# Made `members` and `values` optional so that empty objects ({}) and empty
# arrays ([]) parse; the visitor below handles the optional nodes accordingly.
ParsimoniousJson2 = Grammar(r'''
    json_file = ws? json ws?
    json = object / array
    object = "{" members? ws? "}"
    members = member_and_comma* member
    member_and_comma = member comma
    member = ws? string ws? ":" value
    array = "[" values? ws? "]"
    values = value_and_comma* value
    value_and_comma = value comma
    value = ws? (true / false / object / array / number / string / null) ws?
    true = "true"
    false = "false"
    null = "null"
    number = ~r"-?(0|([1-9][0-9]*))(\.[0-9]+)?([Ee][+-]?[0-9]+)?"
    string = ~"\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is
    ws = ~r"\s+"
    comma = ws? "," ws?''')


class ParsimoniousJson2Visitor(NodeVisitor):
    """Produces Python objects from the ``ParsimoniousJson2`` grammar tree."""

    def generic_visit(self, node, visited_children):
        """Fallback for every rule without a dedicated ``visit_*`` method.

        Returns the list of visited children when it is non-empty, otherwise
        the node itself, so "is it a list?" distinguishes a matched
        optional/repetition from an empty one.
        """
        return visited_children or node

    # helper functions for generic patterns

    def combine_many_or_one(self, node, children):
        """Flatten the pattern ``values = value_and_comma* value`` to a list."""
        members, member = children
        if isinstance(members, list):
            return members + [member]
        return [member]

    def lift_first_child(self, node, visited_children):
        """Return the first visited child, e.g. for::

            rule = item optional another_optional?

        returns ``item``.
        """
        return visited_children[0]

    # visitors

    visit_json = lift_first_child

    def visit_json_file(self, node, children):
        """Return the document, ignoring surrounding whitespace."""
        _leading_ws, json_, _trailing_ws = children
        return json_

    def visit_object(self, node, children):
        """Return a dict of the object's members (empty for ``{}``)."""
        _open, members, _ws, _close = children
        # ``members?`` is a one-element list when present, a bare node if not
        if isinstance(members, list):
            return dict(members[0])
        return {}

    def visit_array(self, node, children):
        """Return a list of the array's values (empty for ``[]``)."""
        _open, values, _ws, _close = children
        # ``values?`` is a one-element list when present, a bare node if not
        if isinstance(values, list):
            return values[0]
        return []

    visit_member_and_comma = visit_value_and_comma = lift_first_child
    visit_values = visit_members = combine_many_or_one

    def visit_member(self, node, children):
        """Return one ``(name, value)`` pair."""
        _1, name, _2, colon, value = children
        return name, value

    def visit_value(self, node, children):
        """Return the value with its surrounding whitespace discarded."""
        _1, value, _2 = children
        # the anonymous choice group arrives as a one-element list
        return value[0]

    def visit_string(self, node, visited_children):
        """Decode the quoted string by evaluating it as a Python literal."""
        # produce unicode for strings
        return ast.literal_eval("u" + node.text)

    def visit_number(self, node, visited_children):
        """Return an int or float, whichever the literal evaluates to."""
        return ast.literal_eval(node.text)

    def visit_true(self, node, visited_children):
        return True

    def visit_false(self, node, visited_children):
        return False

    def visit_null(self, node, visited_children):
        return None
# from https://github.com/lark-parser/lark/blob/master/docs/json_tutorial.md
# Whitespace is skipped via `%ignore WS`; the `-> name` aliases give the
# terminal alternatives rule names that `TreeToJson` defines callbacks for.
lark_json_grammar = r"""
    ?start: value

    ?value: object
          | array
          | string
          | SIGNED_NUMBER -> number
          | "true" -> true
          | "false" -> false
          | "null" -> null

    array : "[" [value ("," value)*] "]"
    object : "{" [pair ("," pair)*] "}"
    pair : string ":" value

    string : ESCAPED_STRING

    %import common.ESCAPED_STRING
    %import common.SIGNED_NUMBER
    %import common.WS

    %ignore WS
"""
class TreeToJson(Transformer):
    """Lark transformer turning the JSON parse tree into Python objects.

    Each attribute is named after the grammar rule (or alias) it handles.
    """

    @v_args(inline=True)
    def string(self, s):
        # Drop the surrounding quotes and unescape embedded double quotes.
        # NOTE(review): this is the only escape sequence handled here; other
        # backslash escapes are returned as written.
        return s[1:-1].replace('\\"', '"')

    # Containers are built directly from the list of transformed children.
    array = list
    pair = tuple
    object = dict

    # Every number becomes a float, whether or not it has a fractional part.
    number = v_args(inline=True)(float)

    def null(self, _):
        return None

    def true(self, _):
        return True

    def false(self, _):
        return False
# Two LALR parsers over the same grammar: `LarkJson` returns a parse tree,
# while `LarkJsonTreeless` is given the transformer up front and returns the
# transformed data from `parse()` directly.
# NOTE(review): 'standard' is the lexer name accepted by the lark-parser
# releases this script lists as a requirement; newer Lark releases appear to
# have renamed it to 'basic' -- confirm before upgrading.
LarkJson = Lark(lark_json_grammar, parser='lalr', lexer='standard')
LarkJsonTreeless = Lark(
    lark_json_grammar, parser='lalr', lexer='standard',
    transformer=TreeToJson())
# Benchmark input, read once up front so that file I/O is never part of a
# timing.  The context manager closes the handle deterministically, and the
# explicit encoding avoids depending on the platform default (JSON text is
# UTF-8).
with open('generated.json', encoding='utf-8') as _input_file:
    s = _input_file.read()
# One row per benchmark: (TESTNUM, printed label, timed statement, setup).
# The setup statement runs before the timer starts, so constructing a visitor
# or transformer is not included in the measurement.
BENCHMARKS = (
    (1, 'Parsimonious 1 (faster grammar; tree only)\n ',
     'ParsimoniousJson1.match(s)',
     'pass'),
    (2, 'parsimonious 1 (faster grammar; transformed data)\n ',
     'v.visit(ParsimoniousJson1.match(s))',
     'v = ParsimoniousJson1Visitor()'),
    (3, 'parsimonious 2 (original grammar; tree only)\n ',
     'ParsimoniousJson2.match(s)',
     'pass'),
    (4, 'parsimonious 2 (original grammar; transformed data)\n ',
     'v.visit(ParsimoniousJson2.match(s))',
     'v = ParsimoniousJson2Visitor()'),
    (5, 'lark (lalr; tree only)\n ',
     'LarkJson.parse(s)',
     'pass'),
    (6, 'lark (lalr; tree-less transformation)\n ',
     'LarkJsonTreeless.parse(s)',
     'pass'),
    (7, 'lark (lalr; tree and transformation)\n ',
     't.transform(LarkJson.parse(s))',
     't = TreeToJson()'),
    (8, 'json (Python standard library)\n ',
     'json.loads(s)',
     'pass'),
)

for _testnum, _label, _stmt, _setup in BENCHMARKS:
    # TESTNUM 0 means "run everything"; otherwise run only the matching row.
    if args.testnum in (0, _testnum):
        # Each statement is executed exactly once.  Passing this module's
        # globals lets the statements see the parsers, visitors and input
        # without requiring the script to be importable as `__main__`.
        print(
            _label,
            timeit.timeit(_stmt, setup=_setup, number=1, globals=globals())
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment