Skip to content

Instantly share code, notes, and snippets.

@diogoaurelio
Created October 21, 2018 15:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save diogoaurelio/9b6338d257e51cddb2f3015ea2ee17ae to your computer and use it in GitHub Desktop.
Save diogoaurelio/9b6338d257e51cddb2f3015ea2ee17ae to your computer and use it in GitHub Desktop.
PySpark 2.3 example
# create the general function
def _amount_spent(quantity: int, price: float) -> float:
"""
Calculates the product between two variables
:param quantity: (float/int)
:param price: (float/int)
:return:
(float/int)
"""
return quantity * price
def amount_spent_udf(data: DataFrame) -> DataFrame:
    """
    Adds an 'amount_spent' column computed by a Python UDF
    :param data: (DataFrame) must contain the 'quantity' and 'price' columns
    :return:
    (DataFrame) data with the extra 'amount_spent' column
    """
    # Wrap the plain Python function as a Spark UDF.
    # Note: a Python float is a 64-bit double, so DoubleType() is the matching
    # Spark type; FloatType() is 32-bit single precision and is not equivalent.
    spent = F.udf(_amount_spent, DoubleType())
    # Apply the UDF row by row to the two input columns
    return data.withColumn('amount_spent', spent(F.col('quantity'), F.col('price')))
def main(conf: ConfigParser, spark: SparkSession) -> None:
    """
    Builds a mock purchases DataFrame, adds the amount spent and shows 10 rows
    :param conf: (ConfigParser) accepted but not read by this function
    :param spark: (SparkSession) session used to create the DataFrame
    :return:
    None
    """
    # mock data: (customer_name, date, category, product_name, quantity, price)
    purchases = (
        ("Geoffrey", "2016-04-22", "A", "apples", 1, 50.0),
        ("Geoffrey", "2016-05-03", "B", "Lamp", 2, 38.0),
        ("Geoffrey", "2016-05-03", "D", "Solar Pannel", 1, 29.0),
        ("Geoffrey", "2016-05-03", "A", "apples", 3, 50.0),
        ("Geoffrey", "2016-05-03", "C", "Rice", 5, 15.0),
        ("Geoffrey", "2016-06-05", "A", "apples", 5, 50.0),
        ("Geoffrey", "2016-06-05", "A", "bananas", 5, 55.0),
        ("Geoffrey", "2016-06-15", "Y", "Motor skate", 7, 68.0),
        ("Geoffrey", "2016-06-15", "E", "Book: The noose", 1, 125.0),
        ("Yann", "2016-04-22", "B", "Lamp", 1, 38.0),
        ("Yann", "2016-05-03", "Y", "Motor skate", 1, 68.0),
        ("Yann", "2016-05-03", "D", "Recycle bin", 5, 27.0),
        ("Yann", "2016-05-03", "C", "Rice", 15, 15.0),
        ("Yann", "2016-04-02", "A", "bananas", 3, 55.0),
        ("Yann", "2016-04-02", "B", "Lamp", 2, 38.0),
        ("Yann", "2016-04-03", "E", "Book: Crime and Punishment", 5, 100.0),
        ("Yann", "2016-04-13", "E", "Book: The noose", 5, 125.0),
        ("Yann", "2016-04-27", "D", "Solar Pannel", 5, 29.0),
        ("Yann", "2016-05-27", "D", "Recycle bin", 5, 27.0),
        ("Yann", "2016-05-27", "A", "bananas", 3, 55.0),
        ("Yann", "2016-05-01", "Y", "Motor skate", 1, 68.0),
        ("Yann", "2016-06-07", "Z", "space ship", 1, 227.0),
        ("Yoshua", "2016-02-07", "Z", "space ship", 2, 227.0),
        ("Yoshua", "2016-02-14", "A", "bananas", 9, 55.0),
        ("Yoshua", "2016-02-14", "B", "Lamp", 2, 38.0),
        ("Yoshua", "2016-02-14", "A", "apples", 10, 55.0),
        ("Yoshua", "2016-03-07", "Z", "space ship", 5, 227.0),
        ("Yoshua", "2016-04-07", "Y", "Motor skate", 4, 68.0),
        ("Yoshua", "2016-04-07", "D", "Recycle bin", 5, 27.0),
        ("Yoshua", "2016-04-07", "C", "Rice", 5, 15.0),
        ("Yoshua", "2016-04-07", "A", "bananas", 9, 55.0),
        ("Jurgen", "2016-05-01", "Z", "space ship", 1, 227.0),
        ("Jurgen", "2016-05-01", "A", "bananas", 5, 55.0),
        ("Jurgen", "2016-05-08", "A", "bananas", 5, 55.0),
        ("Jurgen", "2016-05-08", "Y", "Motor skate", 1, 68.0),
        ("Jurgen", "2016-06-05", "A", "bananas", 5, 55.0),
        ("Jurgen", "2016-06-05", "C", "Rice", 5, 15.0),
        ("Jurgen", "2016-06-05", "Y", "Motor skate", 2, 68.0),
        ("Jurgen", "2016-06-05", "D", "Recycle bin", 5, 27.0),
    )
    # One keyword-built Row per purchase, in the order listed above
    customers = spark.createDataFrame([
        Row(customer_name=who, date=day, category=group, product_name=item, quantity=units, price=unit_price)
        for who, day, group, item, units, unit_price in purchases
    ])
    # Add the amount_spent column and print the first 10 rows
    amount_spent_udf(data=customers).show(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment