Skip to content

Instantly share code, notes, and snippets.

from unittest import TestCase
def dummy_udf(f):
    """Return ``f`` unchanged; a pass-through stand-in for a UDF decorator."""
    return f


def mock_udf(f=None, returnType=None):
    """Stand-in for ``pyspark.sql.functions.udf`` that leaves functions undecorated.

    Works both when called with the function directly (``udf(func, ...)`` or
    a bare ``@udf``) and when used as a decorator factory
    (``@udf(returnType=...)`` or ``@udf("string")``).

    Args:
        f: The function to "wrap", a positional return type, or None.
        returnType: Accepted for signature compatibility; ignored.

    Returns:
        ``f`` itself when it is callable, otherwise ``dummy_udf`` so the
        result can be applied as a decorator.
    """
    # A non-callable first argument (None, a type string, a DataType
    # instance) means we are being used as a decorator factory.
    return f if callable(f) else dummy_udf
from mock import patch
# Replace `pyspark.sql.functions.udf` with the pass-through `mock_udf`.
# The patch is started here and no matching stop() is visible in this snippet.
# NOTE(review): presumably this has to run before importing any module that
# applies @udf at import time — confirm against the import order.
patch('pyspark.sql.functions.udf', mock_udf).start()
from unittest import TestCase
import pytest
from pyspark.sql.types import StringType
@pytest.fixture(scope='function', autouse=True)
def mock_udf_annotation(monkeypatch):
def dummy_udf(f):
return f
@udf(returnType=ArrayType(StringType()))
def to_upper_list(s):
    """Upper-case every string in ``s``.

    Args:
        s: An iterable of strings, or None.

    Returns:
        A list with each element upper-cased, or None when ``s`` is None.
    """
    # Guard against a missing value instead of failing on iteration.
    if s is None:
        return None
    return [i.upper() for i in s]
# Case 1 - UDF annotation
to_upper_list(['potato', 'carrot', 'tomato'])
"""
TypeError: Invalid argument, not a string or column: ['potato', 'carrot', 'tomato'] of type <class 'list'>.
For column literals, use 'lit', 'array', 'struct' or 'create_map' function
# Error 1 - to_upper returns a Column instead of a str
self.assertEqual(to_upper('potato'), 'POTATO')
"""
Column<b'(<lambda>(potato) = POTATO)'>
ValueError: Cannot convert column into bool:
please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
"""
# Error 2 - Spark is expecting a column name <str> or <Column>.
to_upper(None)
from unittest import TestCase
from our_package import to_upper
class TestUDFs(TestCase):
def test_upper(self):
"""
# Case 1 - Lambda
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
# 1.- UDF with f as a lambda: upper-cases the value; any falsy input
#     (None or an empty string) yields None. Declared return type is StringType.
to_upper = udf(lambda s: s.upper() if s else None, StringType())
# 2.- UDF with f as a method
def to_upper(s):
    """Return ``s`` upper-cased, or None when ``s`` is None.

    Args:
        s: The string to convert, or None.

    Returns:
        The upper-cased string; None if ``s`` is None. An empty string is
        returned as an empty string.
    """
    if s is None:
        return None
    return s.upper()
@afranzi
afranzi / 1.device-sensor-wifi-event.json
Created January 9, 2019 15:39
Schema Validation Path Traceability
{
"user": {
"id": "5a34008f8cece4000764cd5a"
},
"device": {
"id": "5a3400a48cece4000764d342",
"platform": "Android"
},
"product": {
"id": "remixprototype",
@afranzi
afranzi / order-event.json
Created December 4, 2018 22:49
Order event Schema
{
"$id": "/schema/event/order",
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"properties": {
"user": { "$ref": "/schema/object/user" },
"products": {
"type": "array",
"items": { "$ref": "/schema/object/product" }
},
@afranzi
afranzi / WineQualityScalaSparkPrediction.md
Created October 29, 2018 10:45
Wine Quality Prediction with Spark Scala and UDF
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Column, DataFrame}
import scala.util.matching.Regex

// Matches a single underscore at the very start of a string.
val FirstAtRe: Regex = "^_".r
// Matches one or more consecutive whitespace, '_', '.', ':' or '@' characters.
val AliasRe: Regex = "[\\s_.:@]+".r

def getFieldAlias(field_name: String): String = {
@afranzi
afranzi / Wine Quality Prediction - Scala.ipynb
Last active October 25, 2018 21:10
MLflow UDFs from Scala Spark
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.