BF bfraiche

## add_rf.py
from pyspark.ml.regression import RandomForestRegressor

# Random forest regressor: inputs are read from the "features" column and
# the target from the "label" column.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="label",
)

## best_hp.py
# Show the hyperparameters carried by the selected model.
# `getNumTrees` is accessed as an attribute (no call) on purpose.
print("numTrees - ", bestModel.getNumTrees)
# maxDepth is looked up by parameter name.
print("maxDepth - ", bestModel.getOrDefault("maxDepth"))

## build_cv.py
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Cross-validator configured with 3 folds, `pipeline` as the estimator,
# `paramGrid` as the candidate parameter maps, and a RegressionEvaluator
# constructed with its defaults as the scorer.
crossval = CrossValidator(
    numFolds=3,
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(),
)

## build_grid.py
from pyspark.ml.tuning import ParamGridBuilder
import numpy as np

# Hyperparameter grid: three evenly spaced values per setting, each converted
# to int -- numTrees spans 10..50 and maxDepth spans 5..25.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, list(map(int, np.linspace(start=10, stop=50, num=3))))
    .addGrid(rf.maxDepth, list(map(int, np.linspace(start=5, stop=25, num=3))))
    .build()
)

## build_pl.py
from pyspark.ml import Pipeline

# Two-stage pipeline: `assembler` first, then the random forest `rf`.
# NOTE(review): `assembler` is not defined in the visible snippets --
# presumably it builds the "features" column; confirm where it is created.
pipeline = Pipeline(
    stages=[
        assembler,
        rf,
    ],
)

## evaluate.py
import matplotlib.pyplot as plt

# Evaluator using the "rmse" metric, comparing the "prediction" column
# against the "label" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)

# Run the model over the full DataFrame, then collect the result to pandas.
# NOTE(review): `model` is not defined in the visible snippets -- confirm
# which fitted model this is meant to be.
rfPred = model.transform(df)
rfResult = rfPred.toPandas()

## get_df.py
# Load all columns of the kings_county_housing table through a SQL query.
# NOTE(review): `mc` is not defined in this snippet -- presumably a Spark
# SQL entry point; confirm where it is created.
df = mc.sql("SELECT * FROM kings_county_housing")

## importance.py
# Take the best pipeline found by cross-validation and pick the stage at
# index 1 (the pipeline is built as [assembler, rf]).
bestPipeline = cvModel.bestModel
bestModel = bestPipeline.stages[1]
importances = bestModel.featureImportances

# One bar per feature, positioned at 0..len(importances)-1.
x_values = [*range(len(importances))]
plt.bar(x_values, importances, orientation="vertical")
# Label each bar with its feature name, tilted 40 degrees.
# NOTE(review): `feature_list` is defined elsewhere -- confirm it matches
# the order of `importances`.
plt.xticks(x_values, feature_list, rotation=40)
plt.ylabel("Importance")

## split_data.py
# Randomly split df with weights 0.8 (training) and 0.2 (test).
# No seed is supplied to randomSplit here.
trainingData, testData = df.randomSplit(
    [0.8, 0.2]
)

## test_pred.py
# Apply cvModel to the test split to obtain predictions.
# NOTE(review): the step that fits `cvModel` is not shown in these
# snippets -- presumably the result of fitting `crossval`; confirm.
predictions = cvModel.transform(testData)
	from pyspark.ml.regression import RandomForestRegressor

	# Random forest regressor: inputs are read from the "features" column and
	# the target from the "label" column.
	rf = RandomForestRegressor(
		featuresCol="features",
		labelCol="label",
	)
	# Show the hyperparameters carried by the selected model.
	# `getNumTrees` is accessed as an attribute (no call) on purpose.
	print("numTrees - ", bestModel.getNumTrees)
	# maxDepth is looked up by parameter name.
	print("maxDepth - ", bestModel.getOrDefault("maxDepth"))
	from pyspark.ml.tuning import CrossValidator
	from pyspark.ml.evaluation import RegressionEvaluator

	# Cross-validator configured with 3 folds, `pipeline` as the estimator,
	# `paramGrid` as the candidate parameter maps, and a RegressionEvaluator
	# constructed with its defaults as the scorer.
	crossval = CrossValidator(
		numFolds=3,
		estimator=pipeline,
		estimatorParamMaps=paramGrid,
		evaluator=RegressionEvaluator(),
	)
	from pyspark.ml.tuning import ParamGridBuilder
	import numpy as np

	# Hyperparameter grid: three evenly spaced values per setting, each converted
	# to int -- numTrees spans 10..50 and maxDepth spans 5..25.
	paramGrid = (
		ParamGridBuilder()
		.addGrid(rf.numTrees, list(map(int, np.linspace(start=10, stop=50, num=3))))
		.addGrid(rf.maxDepth, list(map(int, np.linspace(start=5, stop=25, num=3))))
		.build()
	)
	from pyspark.ml import Pipeline

	# Two-stage pipeline: `assembler` first, then the random forest `rf`.
	# NOTE(review): `assembler` is not defined in the visible snippets --
	# presumably it builds the "features" column; confirm where it is created.
	pipeline = Pipeline(
		stages=[
			assembler,
			rf,
		],
	)
	import matplotlib.pyplot as plt

	# Evaluator using the "rmse" metric, comparing the "prediction" column
	# against the "label" column.
	evaluator = RegressionEvaluator(
		metricName="rmse",
		labelCol="label",
		predictionCol="prediction",
	)
	rmse = evaluator.evaluate(predictions)

	# Run the model over the full DataFrame, then collect the result to pandas.
	# NOTE(review): `model` is not defined in the visible snippets -- confirm
	# which fitted model this is meant to be.
	rfPred = model.transform(df)
	rfResult = rfPred.toPandas()
	# Take the best pipeline found by cross-validation and pick the stage at
	# index 1 (the pipeline is built as [assembler, rf]).
	bestPipeline = cvModel.bestModel
	bestModel = bestPipeline.stages[1]
	importances = bestModel.featureImportances

	# One bar per feature, positioned at 0..len(importances)-1.
	x_values = [*range(len(importances))]
	plt.bar(x_values, importances, orientation="vertical")
	# Label each bar with its feature name, tilted 40 degrees.
	# NOTE(review): `feature_list` is defined elsewhere -- confirm it matches
	# the order of `importances`.
	plt.xticks(x_values, feature_list, rotation=40)
	plt.ylabel("Importance")