## colbyford/SparkML_Scorer.py

## SparkML_Scorer.py
########################################
## Title: Spark MLlib Model Scorer
## Language: PySpark
## Author: Colby T. Ford, Ph.D.
########################################

from pyspark.ml import PipelineModel
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.sql.functions import col, round, udf
from pyspark.sql.types import IntegerType, FloatType

## Load the previously fitted transformation pipeline
pipeline = PipelineModel.load("/mnt/trainedmodels/pipeline/")
## Apply the loaded pipeline's transformations to the new data
## NOTE(review): `dataset` is not defined anywhere in this script; presumably
## it is a DataFrame created earlier in the notebook/session -- confirm.
transformeddataset = pipeline.transform(dataset)

## Load the trained cross-validation model
model = CrossValidatorModel.load("/mnt/trainedmodels/dt/")
## Score the data using the best model held by the cross-validator
stamp = model.bestModel.transform(transformeddataset)

## UDF that extracts element 1 of the probability vector as a float
## NOTE(review): presumably element 1 is the positive-class probability of a
## binary classifier -- confirm against the trained model.
getprob = udf(lambda v: float(v[1]), FloatType())

## Select out the necessary columns; the probability vector is replaced by
## the single value extracted above, under the same column name
output = stamp.select(
    col("ID"),
    col("label"),
    col("rawPrediction"),
    getprob(col("probability")).alias("probability"),
    col("prediction"),
)