@lakshay-arora
Created November 4, 2019 07:28
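The snippet assumes a Spark DataFrame named sample_data_train already exists, with numeric columns feature_1 and feature_4, categorical string columns feature_2 and feature_3, and a binary label column. As a minimal sketch (not part of the original gist, with made-up values purely for illustration), such a DataFrame could be built like this:

from pyspark.sql import SparkSession

# start (or reuse) a local Spark session
spark = SparkSession.builder.appName('logistic_regression_pipeline').getOrCreate()

# hypothetical training data matching the column names used below
sample_data_train = spark.createDataFrame(
    [(2.0, 'A', 'S10', 40.0, 1.0),
     (1.0, 'X', 'E10', 25.0, 1.0),
     (4.0, 'X', 'S20', 10.0, 0.0),
     (3.0, 'Z', 'S10', 20.0, 0.0)],
    ['feature_1', 'feature_2', 'feature_3', 'feature_4', 'label'])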
# required imports: feature transformers, the classifier and the Pipeline itself
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# define stage 1: index the categorical column feature_2 as a numeric column
stage_1 = StringIndexer(inputCol='feature_2', outputCol='feature_2_index')
# define stage 2: index the categorical column feature_3 as a numeric column
stage_2 = StringIndexer(inputCol='feature_3', outputCol='feature_3_index')
# define stage 3: one-hot encode the indexed versions of feature_2 and feature_3 generated by stages 1 and 2
# (note: OneHotEncoderEstimator was renamed to OneHotEncoder in Spark 3.0)
stage_3 = OneHotEncoderEstimator(inputCols=[stage_1.getOutputCol(), stage_2.getOutputCol()],
                                 outputCols=['feature_2_encoded', 'feature_3_encoded'])
# define stage 4: assemble all the features required to train the logistic regression model into a single vector
stage_4 = VectorAssembler(inputCols=['feature_1', 'feature_2_encoded', 'feature_3_encoded', 'feature_4'],
                          outputCol='features')
# define stage 5: the logistic regression model
stage_5 = LogisticRegression(featuresCol='features', labelCol='label')
# set up the pipeline
regression_pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4, stage_5])
# fit the pipeline on the training data
model = regression_pipeline.fit(sample_data_train)
# transform the training data through the fitted pipeline
sample_data_train = model.transform(sample_data_train)
# view some of the generated columns
sample_data_train.select('features', 'label', 'rawPrediction', 'probability', 'prediction').show()
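Fitting a Pipeline returns a PipelineModel, so the same model can score unseen rows in a single call. The sketch below is an assumption, not part of the original gist: sample_data_test is a hypothetical DataFrame with the same feature columns, and its categorical values must have been seen during fitting since StringIndexer's default handleInvalid setting is 'error'.

# hypothetical held-out rows with the same feature columns (no label needed for scoring)
sample_data_test = spark.createDataFrame(
    [(5.0, 'Z', 'S10', 30.0),
     (2.0, 'A', 'E10', 12.0)],
    ['feature_1', 'feature_2', 'feature_3', 'feature_4'])

# the fitted PipelineModel replays every stage: indexing, encoding, assembling and prediction
predictions = model.transform(sample_data_test)
predictions.select('features', 'probability', 'prediction').show()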