Last active
September 23, 2022 16:41
-
-
Save colbyford/47ce6a72162c74c073b44532cd8be78c to your computer and use it in GitHub Desktop.
Score data using a transformation pipeline and trained SparkML model.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################
## Title: Spark MLlib Model Scorer
## Language: PySpark
## Author: Colby T. Ford, Ph.D.
########################################
from pyspark.ml import PipelineModel
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.sql.functions import col, round, udf
from pyspark.sql.types import FloatType, IntegerType
## Load the previously fitted transformation pipeline from storage.
pipeline = PipelineModel.load("/mnt/trainedmodels/pipeline/")

## Apply the loaded pipeline to the new data. Nothing is refit here:
## `transform` only runs the already-fitted stages.
## NOTE: `dataset` is not defined in this script; it must already exist in
## the session (presumably a Spark DataFrame -- confirm against the caller).
transformeddataset = pipeline.transform(dataset)

## Load the trained cross-validated model.
model = CrossValidatorModel.load("/mnt/trainedmodels/dt/")

## Score the data using the best model selected by cross-validation.
stamp = model.bestModel.transform(transformeddataset)

## UDF that extracts element 1 of the probability vector as a float.
## NOTE(review): presumably index 1 is the positive class of a binary
## classifier -- confirm before reusing with a multiclass model.
getprob = udf(lambda v: float(v[1]), FloatType())

## Select out the necessary columns, replacing the probability vector
## with the single extracted probability value.
output = stamp.select(
    col("ID"),
    col("label"),
    col("rawPrediction"),
    getprob(col("probability")).alias("probability"),
    col("prediction"),
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment