Created
April 4, 2024 15:50
-
-
Save dineshdharme/5f72301ffb27a280fb9f71c95604e252 to your computer and use it in GitHub Desktop.
Parsing Boolean Expression Using Lark.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
https://stackoverflow.com/questions/78272962/split-strings-containing-nested-brackets-in-spark-sql | |
This is straightforward to do with the Lark parsing library for Python.
$ `pip install lark --upgrade` | |
Next, define a grammar that can parse your expressions.
The complete script follows:
from pyspark.sql.types import * | |
from pyspark.sql import SparkSession | |
from pyspark.sql.functions import * | |
# Start (or reuse) the Spark session used by this demo.
spark = SparkSession.builder.appName("NoUnstack").getOrCreate()

# Two nullable columns: an integer id and the raw boolean expression text.
schema = (
    StructType()
    .add("exp_id", IntegerType(), True)
    .add("boolean_expression", StringType(), True)
)

# Sample expressions, including nested brackets.
data = [
    (1, "A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)"),
    (2, "(A2 AND A3) OR (B1 AND B2)"),
]

df = spark.createDataFrame(data, schema)
df.show(truncate=False)
# Lark grammar for the boolean expressions handled by evaluate_exp.
#
# * A variable is one upper-case letter followed by one or more digits
#   (A1, B2, A10, ...).
# * AND and OR are alternatives of the same rule, so the grammar itself gives
#   them no relative precedence; use parentheses in the input to force a
#   particular grouping.
# * Whitespace between tokens is ignored.
grammar = """
?start: expression
?expression: atom
    | expression "AND" expression -> and_op
    | expression "OR" expression -> or_op
    | "(" expression ")" -> bracket_exp
?atom: /[A-Z][0-9]+/ -> variable
%import common.WS
%ignore WS
"""
def evaluate_exp(expression):
    """Break a boolean expression into labelled binary sub-expressions.

    The text is parsed with the module-level ``grammar``; each AND/OR node is
    then replaced, children before parents, by a generated label
    (``logic1``, ``logic2``, ...).

    Args:
        expression: Boolean expression text such as
            ``"(A2 AND A3) OR (B1 AND B2)"``, or ``None`` (for example a null
            value coming from a nullable Spark column).

    Returns:
        A list of ``"<left> <OP> <right> -> logicN"`` strings in the order
        the nodes were labelled, or ``None`` when ``expression`` is ``None``.

    Parse errors raised by Lark are not caught here and propagate to the
    caller.
    """
    # Null in, null out: the input column is nullable, and handing None to
    # the parser would fail the whole UDF call instead of yielding a null.
    if expression is None:
        return None

    # Kept function-local as in the original (presumably so the import is
    # resolved wherever the UDF actually runs -- TODO confirm).
    from lark import Lark, Transformer, v_args

    class MyTransformer(Transformer):
        """Collect one ``(expression, label)`` pair per AND/OR node."""

        def __init__(self):
            super().__init__()
            self.logic_counter = 0
            self.transformations = []

        def _record(self, operator, left, right):
            """Register ``left operator right`` and return its new label."""
            self.logic_counter += 1
            logic_label = f"logic{self.logic_counter}"
            self.transformations.append(
                (f"{left} {operator} {right}", logic_label)
            )
            return logic_label

        def variable(self, items):
            # The single child is the matched variable token.
            return str(items[0])

        @v_args(inline=True)
        def and_op(self, left, right):
            return self._record("AND", left, right)

        @v_args(inline=True)
        def or_op(self, left, right):
            return self._record("OR", left, right)

        def bracket_exp(self, items):
            # Brackets only group: pass the inner value through unchanged.
            return items[0]

        def get_transformations(self):
            """Return the recorded steps as ``"<expr> -> <label>"`` strings."""
            return [
                f"{original} -> {label}"
                for original, label in self.transformations
            ]

    parser = Lark(grammar, start='start', parser='lalr')
    transformer = MyTransformer()
    # The transform result (the top-level label) is not needed; only the
    # steps accumulated on the transformer are returned.
    transformer.transform(parser.parse(expression))
    return transformer.get_transformations()
# Expose evaluate_exp to Spark as a UDF that returns an array of strings.
evaluate_exp_udf = udf(evaluate_exp, ArrayType(StringType()))

# First attach the list of steps to each row, then explode it so that every
# step gets a row of its own.
df = (
    df
    .withColumn("ast_tree", evaluate_exp_udf(col("boolean_expression")))
    .withColumn("exploded_col", explode(col("ast_tree")))
)

# Show the full frame, then just the id next to each individual step.
df.show(n=40, truncate=False)
df.select("exp_id", "exploded_col").show(n=40, truncate=False)
Output (from the final `select`):
+------+---------------------------+ | |
|exp_id|exploded_col | | |
+------+---------------------------+ | |
|1 |A2 AND A3 -> logic1 | | |
|1 |A4 OR logic1 -> logic2 | | |
|1 |B1 OR B2 -> logic3 | | |
|1 |logic2 AND logic3 -> logic4| | |
|1 |A1 AND logic4 -> logic5 | | |
|2 |A2 AND A3 -> logic1 | | |
|2 |B1 AND B2 -> logic2 | | |
|2 |logic1 OR logic2 -> logic3 | | |
+------+---------------------------+ | |
Full output:
+------+-----------------------------------------+ | |
|exp_id|boolean_expression | | |
+------+-----------------------------------------+ | |
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)| | |
|2 |(A2 AND A3) OR (B1 AND B2) | | |
+------+-----------------------------------------+ | |
+------+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------+---------------------------+ | |
|exp_id|boolean_expression |ast_tree |exploded_col | | |
+------+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------+---------------------------+ | |
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|[A2 AND A3 -> logic1, A4 OR logic1 -> logic2, B1 OR B2 -> logic3, logic2 AND logic3 -> logic4, A1 AND logic4 -> logic5]|A2 AND A3 -> logic1 | | |
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|[A2 AND A3 -> logic1, A4 OR logic1 -> logic2, B1 OR B2 -> logic3, logic2 AND logic3 -> logic4, A1 AND logic4 -> logic5]|A4 OR logic1 -> logic2 | | |
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|[A2 AND A3 -> logic1, A4 OR logic1 -> logic2, B1 OR B2 -> logic3, logic2 AND logic3 -> logic4, A1 AND logic4 -> logic5]|B1 OR B2 -> logic3 | | |
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|[A2 AND A3 -> logic1, A4 OR logic1 -> logic2, B1 OR B2 -> logic3, logic2 AND logic3 -> logic4, A1 AND logic4 -> logic5]|logic2 AND logic3 -> logic4| | |
|1 |A1 AND (A4 OR (A2 AND A3)) AND (B1 OR B2)|[A2 AND A3 -> logic1, A4 OR logic1 -> logic2, B1 OR B2 -> logic3, logic2 AND logic3 -> logic4, A1 AND logic4 -> logic5]|A1 AND logic4 -> logic5 | | |
|2 |(A2 AND A3) OR (B1 AND B2) |[A2 AND A3 -> logic1, B1 AND B2 -> logic2, logic1 OR logic2 -> logic3] |A2 AND A3 -> logic1 | | |
|2 |(A2 AND A3) OR (B1 AND B2) |[A2 AND A3 -> logic1, B1 AND B2 -> logic2, logic1 OR logic2 -> logic3] |B1 AND B2 -> logic2 | | |
|2 |(A2 AND A3) OR (B1 AND B2) |[A2 AND A3 -> logic1, B1 AND B2 -> logic2, logic1 OR logic2 -> logic3] |logic1 OR logic2 -> logic3 | | |
+------+-----------------------------------------+-----------------------------------------------------------------------------------------------------------------------+---------------------------+ | |
+------+---------------------------+ | |
|exp_id|exploded_col | | |
+------+---------------------------+ | |
|1 |A2 AND A3 -> logic1 | | |
|1 |A4 OR logic1 -> logic2 | | |
|1 |B1 OR B2 -> logic3 | | |
|1 |logic2 AND logic3 -> logic4| | |
|1 |A1 AND logic4 -> logic5 | | |
|2 |A2 AND A3 -> logic1 | | |
|2 |B1 AND B2 -> logic2 | | |
|2 |logic1 OR logic2 -> logic3 | | |
+------+---------------------------+ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment