Albert Franzi afranzi

## pyspark.sh
(mlflow) afranzi:~$ pyspark
[I 19:05:01.572 NotebookApp] sparkmagic extension enabled!
[I 19:05:01.573 NotebookApp] Serving notebooks from local directory: /Users/afranzi/Projects/notebooks
[I 19:05:01.573 NotebookApp] The Jupyter Notebook is running at:
[I 19:05:01.573 NotebookApp] http://localhost:8888/?token=c06252daa6a12cfdd33c1d2e96c8d3b19d90e9f6fc171745
[I 19:05:01.573 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
[C 19:05:01.574 NotebookApp]

    Copy/paste this URL into your browser when you connect for the first time,
    to login with a token:

## sparkWinePrediction.py
import mlflow.pyfunc

model_path = 's3://<bucket>/mlflow/artifacts/1/0f8691808e914d1087cf097a08730f17/artifacts/model'
wine_path = '/Users/afranzi/Projects/data/winequality-red.csv'
wine_udf = mlflow.pyfunc.spark_udf(spark, model_path)

df = spark.read.format("csv").option("header", "true").option('delimiter', ';').load(wine_path)
columns = [ "fixed acidity", "volatile acidity", "citric acid",
            "residual sugar", "chlorides", "free sulfur dioxide",
            "total sulfur dioxide", "density", "pH",

## SparkJupyter.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                afranzi
                / SparkJupyter.md
            
            
              Last active
              May 16, 2019 09:51
            
              
                Spark + Toree + Jupyter
              
          
    Install Spark + Toree + Jupyter
pip install toree
jupyter toree install --spark_home=${SPARK_HOME} --sys-prefix
jupyter kernelspec list

Available kernels:
  apache_toree_scala    /Users/afranzi/.virtualenvs/mlflow/share/jupyter/kernels/apache_toree_scala
  python3               /Users/afranzi/.virtualenvs/mlflow/share/jupyter/kernels/python3


## Wine Quality Prediction - Scala.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                afranzi
                / Wine Quality Prediction - Scala.ipynb
            
            
              Last active
              October 25, 2018 21:10
            
              
                MLflow UDFs from Scala Spark
              
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## WineQualityScalaSparkPrediction.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                afranzi
                / WineQualityScalaSparkPrediction.md
            
            
              Created
              October 29, 2018 10:45
            
              
                Wine Quality Prediction with Spark Scala and UDF
              
          
    import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Column, DataFrame}
import scala.util.matching.Regex

val FirstAtRe: Regex = "^_".r
val AliasRe: Regex = "[\\s_.:@]+".r

def getFieldAlias(field_name: String): String = {

  
## order-event.json
{
  "$id": "/schema/event/order",
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "properties": {
    "user": { "$ref": "/schema/object/user" },
    "products": {
      "type": "array",
      "items": { "$ref": "/schema/object/product" }
    },

## 1.device-sensor-wifi-event.json
{
  "user": {
    "id": "5a34008f8cece4000764cd5a"
  },
  "device": {
    "id": "5a3400a48cece4000764d342",
    "platform": "Android"
  },
  "product": {
    "id": "remixprototype",

## PySparkUDF.py
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# 1.- UDF with f as a lambda
to_upper = udf(lambda s: s.upper() if s else None, StringType())

# 2.- UDF with f as a method
def to_upper(s):
  if s is not None:
    return s.upper()

## test_udf_lambda.py
from unittest import TestCase

from our_package import to_upper


class TestUDFs(TestCase):

    def test_upper(self):
        """
        # Case 1 - Lambda

## udf_lambda_errors.py
# Error 1 - to_upper returns a Column instead of a str
self.assertEqual(to_upper('potato'), 'POTATO')
"""
Column<b'(<lambda>(potato) = POTATO)'>
ValueError: Cannot convert column into bool:
please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
"""

# Error 2 - Spark is expecting a column name <str> or <Column>.
to_upper(None)
	(mlflow) afranzi:~$ pyspark
	[I 19:05:01.572 NotebookApp] sparkmagic extension enabled!
	[I 19:05:01.573 NotebookApp] Serving notebooks from local directory: /Users/afranzi/Projects/notebooks
	[I 19:05:01.573 NotebookApp] The Jupyter Notebook is running at:
	[I 19:05:01.573 NotebookApp] http://localhost:8888/?token=c06252daa6a12cfdd33c1d2e96c8d3b19d90e9f6fc171745
	[I 19:05:01.573 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation).
	[C 19:05:01.574 NotebookApp]

	Copy/paste this URL into your browser when you connect for the first time,
	to login with a token:
	import mlflow.pyfunc

	model_path = 's3://<bucket>/mlflow/artifacts/1/0f8691808e914d1087cf097a08730f17/artifacts/model'
	wine_path = '/Users/afranzi/Projects/data/winequality-red.csv'
	wine_udf = mlflow.pyfunc.spark_udf(spark, model_path)

	df = spark.read.format("csv").option("header", "true").option('delimiter', ';').load(wine_path)
	columns = [ "fixed acidity", "volatile acidity", "citric acid",
	"residual sugar", "chlorides", "free sulfur dioxide",
	"total sulfur dioxide", "density", "pH",
	{
	"$id": "/schema/event/order",
	"$schema": "http://json-schema.org/draft-07/schema#",
	"type": "object",
	"properties": {
	"user": { "$ref": "/schema/object/user" },
	"products": {
	"type": "array",
	"items": { "$ref": "/schema/object/product" }
	},
	{
	"user": {
	"id": "5a34008f8cece4000764cd5a"
	},
	"device": {
	"id": "5a3400a48cece4000764d342",
	"platform": "Android"
	},
	"product": {
	"id": "remixprototype",
	from pyspark.sql.types import StringType
	from pyspark.sql.functions import udf

	# 1.- UDF with f as a lambda
	to_upper = udf(lambda s: s.upper() if s else None, StringType())

	# 2.- UDF with f as a method
	def to_upper(s):
	if s is not None:
	return s.upper()
	from unittest import TestCase

	from our_package import to_upper


	class TestUDFs(TestCase):

	def test_upper(self):
	"""
	# Case 1 - Lambda
	# Error 1 - to_upper returns a Column instead of a str
	self.assertEqual(to_upper('potato'), 'POTATO')
	"""
	Column<b'(<lambda>(potato) = POTATO)'>
	ValueError: Cannot convert column into bool:
	please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
	"""

	# Error 2 - Spark is expecting a column name <str> or <Column>.
	to_upper(None)