Last active
September 22, 2022 09:55
-
-
Save moser/cbf5eb26a8111cd35811987b865e72b7 to your computer and use it in GitHub Desktop.
parse_text_interpolation.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
pip install parsimonious | |
""" | |
import dataclasses as _dc
from typing import Any, Iterable, Iterator

from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
TEXT = """ | |
Here are some {variable}s. I don't care about spaces in the { interpolations}. | |
More lines... | |
{and_one_more} | |
Linebreaks are not ok in the curlies: { | |
i_will_not_be_recognized_as_var} | |
""" | |
# PEG grammar of the interpolation mini-language.
# - ``expr`` is any sequence of interpolations and text chunks; because the
#   choice is ordered, an interpolation is always tried before plain text.
# - ``interpolation`` is ``{name}``; only spaces (no line breaks) may pad
#   the name inside the braces.
# - ``text`` also accepts a lone "{" or "}", so a brace that does not form a
#   valid interpolation is consumed as text.
main_grammar = Grammar(
    r"""
    expr = (interpolation / text)*
    interpolation = LPAR var_usage RPAR
    text = TEXT / LPAR / RPAR
    var_usage = space* VAR_NAME space*
    space = " "
    VAR_NAME = ~"[a-zA-Z0-9_]+"
    TEXT = ~"[^{}]+"
    LPAR = ~"{"
    RPAR = "}"
    """
)
class MainVisitor(NodeVisitor):
    """Convert a parse tree produced by ``main_grammar`` into node objects.

    ``expr`` becomes a ``Node`` whose children are ``TextNode`` and
    ``InterpolationNode`` instances in source order.
    """

    def visit_expr(self, node, visited_children):
        """Build the root ``Node`` from all visited chunks."""
        # ``generic_visit`` hands back lists for the unnamed wrapper
        # expressions, so the children may be nested; flatten them first.
        return Node(children=list(flatten_list(visited_children)))

    def visit_text(self, node, visited_children):
        """Wrap the matched source text in a ``TextNode``."""
        return TextNode(node.text)

    def visit_interpolation(self, node, visited_children):
        """Build an ``InterpolationNode`` from the enclosed variable name."""
        return InterpolationNode(
            var_name=self._single_var_name(visited_children).name
        )

    def visit_var_usage(self, node, visited_children):
        """Return the ``VarName`` child, dropping the surrounding spaces."""
        return self._single_var_name(visited_children)

    def visit_VAR_NAME(self, node, visited_children):
        """Wrap the matched identifier in a ``VarName``."""
        return VarName(name=node.text)

    def generic_visit(self, node, visited_children):
        """Pass results through for every rule without a dedicated visitor.

        Returns the visited children when there are any, otherwise the raw
        parse node itself.
        """
        return visited_children or node

    @staticmethod
    def _single_var_name(visited_children):
        """Return the first ``VarName`` among *visited_children*.

        According to the grammar there is exactly one; an ``IndexError`` is
        raised if there is none.
        """
        return [
            child for child in visited_children if isinstance(child, VarName)
        ][0]
def flatten_list(lst: Iterable[Any]) -> Iterator[Any]:
    """Yield the items of *lst* depth-first, descending into nested lists.

    Only ``list`` instances are unpacked; tuples, strings and every other
    kind of value are yielded unchanged.
    """
    for item in lst:
        if isinstance(item, list):
            yield from flatten_list(item)
        else:
            yield item
@_dc.dataclass
class Node:
    """Container node holding an ordered list of child nodes."""

    # Child nodes; each one must provide a ``flatten()`` method.
    children: list

    def flatten(self) -> list:
        """Return the flattened children as one flat list.

        A child whose ``flatten()`` returns a list (for example a nested
        ``Node``) is spliced into the result instead of being inserted as a
        nested list, so callers can always iterate the result directly.
        """
        flat = []
        for child in self.children:
            flattened = child.flatten()
            if isinstance(flattened, list):
                flat.extend(flattened)
            else:
                flat.append(flattened)
        return flat
@_dc.dataclass
class TextNode:
    """Leaf node for a chunk of literal text."""

    # The text exactly as it was matched in the input.
    text: str

    def flatten(self):
        """Return the node itself; a leaf has nothing to unpack."""
        return self
@_dc.dataclass
class InterpolationNode:
    """Leaf node for one ``{variable}`` interpolation."""

    # Name of the referenced variable, without braces or padding spaces.
    var_name: str

    def flatten(self):
        """Return the node itself; a leaf has nothing to unpack."""
        return self
@_dc.dataclass
class VarName:
    """Intermediate value carrying a variable name up the parse tree."""

    # The identifier exactly as it was matched in the input.
    name: str
# Demo: parse the sample text, convert the parse tree to node objects and
# report the flat node list followed by the names of all used variables.
tree = main_grammar.parse(TEXT)
out = MainVisitor().visit(tree)
flat_nodes = out.flatten()  # computed once and reused for both reports
print(flat_nodes)
print("Used vars")
print([node.var_name for node in flat_nodes if isinstance(node, InterpolationNode)])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Paste the grammar below in the online peggy parser builder: https://peggyjs.org/online.html
And use this as the test text:
Here are some {variable}s. I don't care about spaces in the { interpolations}.
More lines...
{and_one_more}
Linebreaks are not ok in the curlies: {
i_will_not_be_recognized_as_var}
*/

// Any sequence of interpolations and text chunks.
expr
  = one_expr*

// An interpolation is tried first; anything else is reported as text.
one_expr
  = interp:interpolation { return interp }
  / text { return {type:"text", text: text()} }

// "{name}" with optional padding spaces; yields the var_usage object.
interpolation
  = LPAR var_use:var_usage RPAR { return var_use }

// A run of non-brace characters or a single stray brace.
// The alternatives are parenthesised so the action applies to all of them,
// not only to the last one.
text
  = (TEXT / LPAR / RPAR) { return text() }

var_usage
  = space* var_name:VAR_NAME space* { return {type:"var", var: var_name} }

// Only a plain space may pad a variable name; line breaks are not allowed.
space
  = " "

VAR_NAME
  = [a-zA-Z0-9_]+ { return text() }

TEXT
  = [^{}]+

LPAR
  = "{"

RPAR
  = "}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment