@aialenti
Created September 17, 2020 23:32
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import ArrayType, StringType

# Create the function that will be wrapped in the UDF
def array_repeat_custom(element, count):
    # Stringify the element and repeat it `count` times
    return ["{}".format(element) for _ in range(count)]

# Convert the function into a UDF. It's good practice to declare the return type
# of the UDF; in this case the return type is an array of strings
array_repeat_custom_udf = udf(array_repeat_custom, ArrayType(StringType()))

# Read the source table in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")

# Call the UDF.
# Note the `lit(3)` syntax declaring how many times to repeat the element:
# our UDF only accepts columns as input, and the `lit` operator is the
# standard way in Spark to convert a constant into a Spark column
sales_table_execution_plan = sales_table.select(
    array_repeat_custom_udf(col("num_pieces_sold"), lit(3)).alias("sample_array")
)
sales_table_execution_plan.show()
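Because a UDF just wraps a plain Python function, its logic can be sanity-checked without a SparkSession. A minimal sketch (the function below duplicates the UDF body so it runs standalone; the sample inputs are illustrative, not from the sales table):

```python
# Plain-Python replica of the UDF body, runnable without Spark
def array_repeat_custom(element, count):
    # Stringify the element and repeat it `count` times
    return ["{}".format(element) for _ in range(count)]

print(array_repeat_custom(7, 3))   # → ['7', '7', '7']
print(array_repeat_custom("x", 0)) # → []
```

Note that on Spark 2.4+ a native `pyspark.sql.functions.array_repeat` exists and avoids the serialization cost of a Python UDF, though it does not stringify the element as this custom version does.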