Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dineshdharme/5f72301ffb27a280fb9f71c95604e252 to your computer and use it in GitHub Desktop.
Save dineshdharme/5f72301ffb27a280fb9f71c95604e252 to your computer and use it in GitHub Desktop.
Parsing Boolean Expressions Using Lark.
https://stackoverflow.com/questions/78272962/split-strings-containing-nested-brackets-in-spark-sql
This is easy to do with the Lark parsing library for Python.
$ `pip install lark --upgrade`
Next, define a grammar that can parse your expressions.
The full script follows:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Obtain the Spark session used by the rest of this example.
spark = SparkSession.builder.appName("NoUnstack").getOrCreate()

# Two nullable columns: an integer id and the boolean expression text.
schema = StructType(
    [
        StructField("exp_id", IntegerType(), nullable=True),
        StructField("boolean_expression", StringType(), nullable=True),
    ]
)

# Sample expressions mixing AND/OR with nested parentheses.
data = [
    (1, "A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)"),
    (2, "(A2 AND A3) OR (B1 AND B2)"),
]

df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)
# Lark grammar for boolean expressions over variables such as A1 or B27.
#
# The rules are stratified so the grammar is unambiguous and does not depend
# on the LALR parser's implicit "resolve shift/reduce as shift" behaviour:
#   * AND binds tighter than OR;
#   * both operators are right-associative (A AND B AND C == A AND (B AND C));
#   * parentheses override precedence.
# A variable is one uppercase letter followed by one or more digits, so
# multi-digit names like A10 are accepted.
# The aliases (and_op, or_op, bracket_exp, variable) are the callback names
# the transformer in evaluate_exp dispatches on.
grammar = """
?start: expression
?expression: and_expr
    | and_expr "OR" expression -> or_op
?and_expr: primary
    | primary "AND" and_expr -> and_op
?primary: atom
    | "(" expression ")" -> bracket_exp
?atom: /[A-Z][0-9]+/ -> variable
%import common.WS
%ignore WS
"""
def evaluate_exp(expression):
    """Break a boolean expression into labelled binary steps.

    The expression is parsed with the module-level ``grammar`` and walked
    bottom-up. Every AND/OR node is assigned the next ``logicN`` label and
    recorded as ``"<left> <OP> <right> -> logicN"``, where an operand is
    either a variable name or the label of an earlier step.

    Args:
        expression: The expression text, or ``None``.

    Returns:
        A list of step strings in evaluation order (empty when the
        expression contains no operator), or ``None`` when ``expression``
        is ``None``.

    Raises:
        lark.exceptions.LarkError: If the text does not match the grammar.
    """
    # The source column is nullable; pass a missing value through instead of
    # handing None to the parser, which would fail the whole UDF call.
    if expression is None:
        return None

    # NOTE(review): imported locally, presumably so the module is resolved
    # wherever the UDF actually runs -- confirm before hoisting.
    from lark import Lark, Transformer, v_args

    class MyTransformer(Transformer):
        """Replace each AND/OR node by a fresh label and log the step."""

        def __init__(self):
            super().__init__()
            self.logic_counter = 0
            self.transformations = []

        def _record(self, operator, left, right):
            # Shared by and_op/or_op: allocate the next label, remember the
            # step, and return the label so the parent node can refer to it.
            self.logic_counter += 1
            logic_label = f"logic{self.logic_counter}"
            self.transformations.append(
                (f"{left} {operator} {right}", logic_label)
            )
            return logic_label

        def variable(self, items):
            return str(items[0])

        @v_args(inline=True)
        def and_op(self, left, right):
            return self._record("AND", left, right)

        @v_args(inline=True)
        def or_op(self, left, right):
            return self._record("OR", left, right)

        def bracket_exp(self, items):
            # Parentheses only group; forward the inner operand unchanged.
            return items[0]

        def get_transformations(self):
            return [
                f"{original} -> {label}"
                for original, label in self.transformations
            ]

    parser = Lark(grammar, start='start', parser='lalr')
    parsed = parser.parse(expression)
    transformer = MyTransformer()
    # The transform result (the top-level label) is not needed; only the
    # steps collected along the way are returned.
    transformer.transform(parsed)
    return transformer.get_transformations()
# Expose evaluate_exp to Spark as a UDF returning an array of strings.
evaluate_exp_udf = udf(evaluate_exp, returnType=ArrayType(StringType()))

# Attach the list of steps to every row, then emit one row per step.
df = (
    df.withColumn("ast_tree", evaluate_exp_udf(col("boolean_expression")))
    .withColumn("exploded_col", explode(col("ast_tree")))
)

df.show(n=40, truncate=False)
df.select("exp_id", "exploded_col").show(n=40, truncate=False)
Output:
+------+---------------------------+
|exp_id|exploded_col |
+------+---------------------------+
|1 |A2 AND A3 -> logic1 |
|1 |A4 OR logic1 -> logic2 |
|1 |B1 OR B2 -> logic3 |
|1 |logic2 AND logic3 -> logic4|
|1 |A1 AND logic4 -> logic5 |
|2 |A2 AND A3 -> logic1 |
|2 |B1 AND B2 -> logic2 |
|2 |logic1 OR logic2 -> logic3 |
+------+---------------------------+
Full output:
+------+-----------------------------------------+
|exp_id|boolean_expression |
+------+-----------------------------------------+
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|
|2 |(A2 AND A3) OR (B1 AND B2) |
+------+-----------------------------------------+
+------+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------+---------------------------+
|exp_id|boolean_expression |ast_tree |exploded_col |
+------+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------+---------------------------+
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|[A2 AND A3 -> logic1, A4 OR logic1 -> logic2, B1 OR B2 -> logic3, logic2 AND logic3 -> logic4, A1 AND logic4 -> logic5]|A2 AND A3 -> logic1 |
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|[A2 AND A3 -> logic1, A4 OR logic1 -> logic2, B1 OR B2 -> logic3, logic2 AND logic3 -> logic4, A1 AND logic4 -> logic5]|A4 OR logic1 -> logic2 |
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|[A2 AND A3 -> logic1, A4 OR logic1 -> logic2, B1 OR B2 -> logic3, logic2 AND logic3 -> logic4, A1 AND logic4 -> logic5]|B1 OR B2 -> logic3 |
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|[A2 AND A3 -> logic1, A4 OR logic1 -> logic2, B1 OR B2 -> logic3, logic2 AND logic3 -> logic4, A1 AND logic4 -> logic5]|logic2 AND logic3 -> logic4|
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|[A2 AND A3 -> logic1, A4 OR logic1 -> logic2, B1 OR B2 -> logic3, logic2 AND logic3 -> logic4, A1 AND logic4 -> logic5]|A1 AND logic4 -> logic5 |
|2 |(A2 AND A3) OR (B1 AND B2) |[A2 AND A3 -> logic1, B1 AND B2 -> logic2, logic1 OR logic2 -> logic3] |A2 AND A3 -> logic1 |
|2 |(A2 AND A3) OR (B1 AND B2) |[A2 AND A3 -> logic1, B1 AND B2 -> logic2, logic1 OR logic2 -> logic3] |B1 AND B2 -> logic2 |
|2 |(A2 AND A3) OR (B1 AND B2) |[A2 AND A3 -> logic1, B1 AND B2 -> logic2, logic1 OR logic2 -> logic3] |logic1 OR logic2 -> logic3 |
+------+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------+---------------------------+
+------+---------------------------+
|exp_id|exploded_col |
+------+---------------------------+
|1 |A2 AND A3 -> logic1 |
|1 |A4 OR logic1 -> logic2 |
|1 |B1 OR B2 -> logic3 |
|1 |logic2 AND logic3 -> logic4|
|1 |A1 AND logic4 -> logic5 |
|2 |A2 AND A3 -> logic1 |
|2 |B1 AND B2 -> logic2 |
|2 |logic1 OR logic2 -> logic3 |
+------+---------------------------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment