Created
September 17, 2020 23:32
-
-
Save aialenti/6f7ba631d5958799be19095a3b530515 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql.functions import col, lit, udf
from pyspark.sql.types import ArrayType, StringType
# Create the function that will be used in the UDF
def array_repeat_custom(element, count):
    """Return a list with the string form of `element` repeated `count` times.

    Parameters:
        element: any value; it is converted to its string representation.
        count: number of repetitions. Zero or negative counts yield an
            empty list (standard `range` semantics).

    Returns:
        list[str]: `count` copies of `str(element)`.
    """
    # Avoid shadowing the builtin `list`; `_` marks the unused loop variable.
    return [str(element) for _ in range(count)]
# Wrap the plain Python function as a Spark UDF, declaring the return type
# explicitly (an array of strings) — good practice so Spark need not infer it.
array_repeat_custom_udf = udf(array_repeat_custom, ArrayType(StringType()))

# Load the source table from Parquet.
sales_table = spark.read.parquet("./data/sales_parquet")

# Build the projected column first, then select it. `lit(3)` turns the
# constant repeat count into a Spark column, since UDFs only accept columns.
repeated_pieces = array_repeat_custom_udf(
    col("num_pieces_sold"), lit(3)
).alias("sample_array")

sales_table_execution_plan = sales_table.select(repeated_pieces)
sales_table_execution_plan.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment