# lakshay-arora/pipeline_5_pyspark.py

## pipeline_5_pyspark.py
# Build a small unlabeled sample to score: there is no label column, only
# the four feature columns. Each row is
# (feature_1: float, feature_2: str, feature_3: str, feature_4: int).
# NOTE(review): `spark` is not defined in this snippet -- presumably the
# active SparkSession created earlier; confirm.
sample_data_test = spark.createDataFrame(
    [
        (3.0, 'Z', 'S10', 40),
        (1.0, 'X', 'E10', 20),
        (4.0, 'A', 'S20', 10),
        (3.0, 'A', 'S10', 20),
        (4.0, 'X', 'D10', 30),
        (1.0, 'Z', 'E10', 20),
        (4.0, 'A', 'S10', 30),
    ],
    ['feature_1', 'feature_2', 'feature_3', 'feature_4'],
)

# Run the sample through the pipeline and keep the transformed frame under
# the same name.
# NOTE(review): `model` is not defined in this snippet -- presumably the
# fitted pipeline model from an earlier step; confirm.
sample_data_test = model.transform(sample_data_test)

# Display the assembled features next to the model outputs for each row.
# The transformed frame is expected to expose these four columns.
sample_data_test.select('features', 'rawPrediction', 'probability', 'prediction').show()