Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
from utils.scala_functions import find_matching_patterns
from pyspark.sql import functions as F
# Gather every value of the "pattern" column of `regex` into a single
# driver-side Python list (collect() returns one row with one list cell).
regexes = regex.agg(F.collect_list(F.col("pattern"))).collect()[0][0]
# Re-bind `regexes` to a broadcast handle; the raw list is `regexes.value`.
regexes = sc.broadcast(regexes)
# The chain is wrapped in parentheses instead of backslash continuations:
# the original dropped a closing ")" and a "\" after the first withColumn,
# which made the whole statement a syntax error.
articles = (
    articles
    # NOTE(review): `regexes.value` is evaluated here on the driver, so the
    # helper receives the plain list — confirm the broadcast is still needed.
    .withColumn("patterns", find_matching_patterns(F.col("text"), regexes.value))
    # Replace a null "patterns" with a one-element array holding null so the
    # explode below still emits a row for it (explode skips null arrays).
    .withColumn(
        "patterns",
        F.when(F.col("patterns").isNull(), F.array(F.lit(None))).otherwise(F.col("patterns")),
    )
    # One output row per element of "patterns", exposed as column "pattern".
    .withColumn("pattern", F.explode(F.col("patterns")))
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.