BF bfraiche

## bayes_w_r_and_sparklyr.R
# Set the `defaultLibrary` field on `mc` to "sparklyr".
# NOTE(review): `mc` is not defined in this snippet -- presumably a context
# object supplied by the host environment; confirm.
mc$defaultLibrary <- "sparklyr"

library(sparklyr)
library(tidyverse)

# Run a SQL query through magpie::sql() and keep the result as `speeches`.
# NOTE(review): the WHERE clause stops at `president` with no comparison, so
# the query text looks truncated -- confirm the intended predicate.
speeches <- magpie::sql(mc, "SELECT * FROM presidential_speeches WHERE president")

# Tokenize `speech_text` into `words`, then remove stop words into
# `clean_words`.
# NOTE(review): the chain ends on a dangling `%>%`; the remaining pipeline
# steps appear to be missing from this snippet.
partitions <- speeches %>%
    ft_tokenizer(input_col = 'speech_text', output_col = 'words') %>%
    ft_stop_words_remover(input_col = 'words', output_col = 'clean_words') %>%

## random_forest_with_python_and_spark_ml.py
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import matplotlib.pyplot as plt
import numpy as np

# Pull in the data: run the query through `mc.sql` and keep the result as `df`.
# NOTE(review): `mc` is not defined in this snippet -- presumably a session or
# context object provided by the host environment; confirm. Other snippets use
# `df` like a Spark DataFrame (`df.columns`, `df.randomSplit`).
df = mc.sql("SELECT * FROM kings_county_housing")

## vec_asmbl.py
from pyspark.ml.feature import VectorAssembler

# Every column of `df` except the target column 'label' is used as a model
# input.
feature_list = [name for name in df.columns if name != 'label']

# Combine the input columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_list, outputCol="features")

## train_model.py
# Fit the cross-validator on the training split and keep the fitted result as
# `cvModel`.
# NOTE(review): `crossval` is not defined in any visible snippet -- presumably
# a CrossValidator wrapping `pipeline`; confirm where it is built.
cvModel = crossval.fit(trainingData)

## test_pred.py
# Apply the fitted cross-validated model to the held-out split; the result is
# what evaluate.py scores.
predictions = cvModel.transform(testData)

## split_data.py
# Randomly partition `df` into training and test sets with weights 0.8 / 0.2.
# No seed is passed, so the split is not pinned across runs.
trainingData, testData = df.randomSplit([0.8, 0.2])

## importance.py
# Take the best pipeline found by cross-validation. The pipeline is assembled
# as [assembler, rf], so stage 1 is the fitted random forest.
bestPipeline = cvModel.bestModel
bestModel = bestPipeline.stages[1]

# Per-feature importance scores, returned as a Spark ML Vector.
importances = bestModel.featureImportances

# One bar position per feature.
x_values = list(range(len(importances)))

# Give matplotlib a plain NumPy array via Vector.toArray() instead of relying
# on implicit coercion of the Spark Vector, which may be sparse.
plt.bar(x_values, importances.toArray(), orientation='vertical')
plt.xticks(x_values, feature_list, rotation=40)
plt.ylabel('Importance')

## get_df.py
# Run the query through `mc.sql` and keep the result as `df`.
# NOTE(review): `mc` is not defined in this snippet -- presumably a session or
# context object provided by the host environment; confirm.
df = mc.sql("SELECT * FROM kings_county_housing")

## evaluate.py
import matplotlib.pyplot as plt

# Score the held-out predictions: compare the "prediction" column against the
# "label" column using the "rmse" metric.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

# Run the model over the full dataset and convert the result to pandas.
# NOTE(review): `model` is not defined in any visible snippet (the fitted model
# is called `cvModel` elsewhere) -- confirm which object is intended here.
rfPred = model.transform(df)
rfResult = rfPred.toPandas()

## build_pl.py
from pyspark.ml import Pipeline

# Chain feature assembly and the estimator into one pipeline, in that order.
# NOTE(review): `rf` is not defined in any visible snippet -- presumably a
# RandomForestRegressor; confirm where it is constructed.
pipeline = Pipeline(stages=[assembler, rf])
	# Tab-indented repeat of the sparklyr snippet at the top of the file.
	# Set the `defaultLibrary` field on `mc` to "sparklyr".
	# NOTE(review): `mc` is not defined in this snippet -- presumably a context
	# object supplied by the host environment; confirm.
	mc$defaultLibrary <- "sparklyr"

	library(sparklyr)
	library(tidyverse)

	# Run a SQL query through magpie::sql() and keep the result as `speeches`.
	# NOTE(review): the WHERE clause stops at `president` with no comparison, so
	# the query text looks truncated -- confirm the intended predicate.
	speeches <- magpie::sql(mc, "SELECT * FROM presidential_speeches WHERE president")

	# Tokenize `speech_text` into `words`, then remove stop words into
	# `clean_words`.
	# NOTE(review): the chain ends on a dangling `%>%`; the remaining pipeline
	# steps appear to be missing from this snippet.
	partitions <- speeches %>%
	ft_tokenizer(input_col = 'speech_text', output_col = 'words') %>%
	ft_stop_words_remover(input_col = 'words', output_col = 'clean_words') %>%
	from pyspark.ml import Pipeline
	from pyspark.ml.feature import VectorAssembler
	from pyspark.ml.regression import RandomForestRegressor
	from pyspark.ml.evaluation import RegressionEvaluator
	from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
	import matplotlib.pyplot as plt
	import numpy as np

	# Pull in the data: run the query through `mc.sql` and keep the result as `df`.
	# NOTE(review): `mc` is not defined in this snippet -- presumably a session or
	# context object provided by the host environment; confirm.
	df = mc.sql("SELECT * FROM kings_county_housing")
	from pyspark.ml.feature import VectorAssembler

	# Every column of `df` except the target column 'label' is used as a model
	# input. Written as a single expression so it does not depend on nested
	# indentation, which this flattened copy of the snippet has lost.
	feature_list = [name for name in df.columns if name != 'label']

	# Combine the input columns into a single vector column named "features".
	assembler = VectorAssembler(inputCols=feature_list, outputCol="features")
	# Take the best pipeline found by cross-validation. The pipeline is assembled
	# as [assembler, rf], so stage 1 is the fitted random forest.
	bestPipeline = cvModel.bestModel
	bestModel = bestPipeline.stages[1]

	# Per-feature importance scores, returned as a Spark ML Vector.
	importances = bestModel.featureImportances

	# One bar position per feature.
	x_values = list(range(len(importances)))

	# Give matplotlib a plain NumPy array via Vector.toArray() instead of relying
	# on implicit coercion of the Spark Vector, which may be sparse.
	plt.bar(x_values, importances.toArray(), orientation='vertical')
	plt.xticks(x_values, feature_list, rotation=40)
	plt.ylabel('Importance')
	import matplotlib.pyplot as plt

	# Score the held-out predictions: compare the "prediction" column against the
	# "label" column using the "rmse" metric.
	evaluator = RegressionEvaluator(
	    labelCol="label",
	    predictionCol="prediction",
	    metricName="rmse",
	)
	rmse = evaluator.evaluate(predictions)

	# Run the model over the full dataset and convert the result to pandas.
	# NOTE(review): `model` is not defined in any visible snippet (the fitted model
	# is called `cvModel` elsewhere) -- confirm which object is intended here.
	rfPred = model.transform(df)
	rfResult = rfPred.toPandas()
	from pyspark.ml import Pipeline

	# Chain feature assembly and the estimator into one pipeline, in that order.
	# NOTE(review): `rf` is not defined in any visible snippet -- presumably a
	# RandomForestRegressor; confirm where it is constructed.
	pipeline = Pipeline(stages=[assembler, rf])