Last active
September 24, 2019 00:09
-
-
Save oneryalcin/5ddfa8834cfe00e90b940e77f2d8cb60 to your computer and use it in GitHub Desktop.
12 Pipeline stages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the ordered list of Pipeline stages. Order matters: each stage reads
# columns written by the stages that precede it.
stages = []

# Convert each categorical column to a numeric index column "<column>_idx".
# NOTE(review): .fit(j) is called here, so the pipeline receives already-fitted
# indexer models rather than unfitted estimators; Pipeline.fit() is therefore
# not expected to re-learn the label-to-index mapping. Confirm that `j`
# (defined outside this cell) is the DataFrame the indexers should be fit on.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_idx").fit(j)
    for column in ['level', 'gender']
]

# Convert the index columns to one-hot encoded sparse vectors.
onehotencoder = OneHotEncoderEstimator(
    inputCols=['gender_idx', 'level_idx'],
    outputCols=['gender_dummy', 'level_dummy'],
)

# Assemble all predictor columns into a single "nonScaledFeatures" vector.
assembler = VectorAssembler(
    inputCols=[
        'gender_dummy',
        'level_dummy',
        'logSessionCount',
        'sqrtMeanSongCount',
        'sqrtSessionsFreqDay',
    ],
    outputCol='nonScaledFeatures',
)

# Scale the features. Both scalers read "nonScaledFeatures" and each writes its
# own output column, so the classifier can be pointed at scaled or non-scaled
# features through its featuresCol argument.
scalers = [
    MinMaxScaler(inputCol="nonScaledFeatures", outputCol="minMaxScaledFeatures"),
    StandardScaler(
        inputCol="nonScaledFeatures",
        outputCol="stdScaledFeatures",
        withStd=True,
        withMean=True,
    ),
]

# Define the RandomForestClassifier with its features (predictors) and label.
# It is trained on the non-scaled feature vector.
classifier = RandomForestClassifier(featuresCol='nonScaledFeatures', labelCol='churned')

# Add everything to the stage list in execution order:
# indexers -> one-hot encoder -> assembler -> scalers -> classifier.
stages += indexers
stages.append(onehotencoder)
stages.append(assembler)
stages += scalers
stages.append(classifier)

# Create a pipeline object defined by the stages above.
pipeline = Pipeline(stages=stages)

# Check the stage order (bare expression: displayed when run as a notebook cell).
stages
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment