Shaheen shaheeng

## pyspark_dataprofile
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import isnan, when, count, col

def dataprofile(data_all_df,data_cols):
    data_df = data_all_df.select(data_cols)
    columns2Bprofiled = data_df.columns
    global schema_name, table_name
    if not 'schema_name' in globals():
        schema_name = 'schema_name'

## predictwinequality_3labels_MSR.R
# Purpose: Build classification models to predict wine quality
#          Use three different classification algorithms and compare their accuracies
# Author : Shaheen Gauher - Data Scientist at Microsoft

# Note: The code below requires MRS (Microsoft R Server, formerly Revolution R Enterprise (RRE))
# http://blog.revolutionanalytics.com/2016/01/microsoft-r-open.html
# MRS can be downloaded from https://www.dreamspark.com/Product/Product.aspx?productid=105

##download data from
#https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv

## predictwinequality_MRSpart5.R
################################################################################
## Decision forest modeling
################################################################################
#Decision Forest
#using rxDForest() to build ML model
DForest_model <- rxDForest(formula = formula,
                           data = trainingdata,
                           seed = 10,
                           cp = 0.01,
                           nTree = 50,

## predictwinequality_MRSpart6.R
#=======================================================
# Compute the accuracy of the trained models and how it performs on the test data
#=======================================================

#Function to compute accuracy of the trained model on the given data
computeaccuracy <- function(ML_model,scoredata){
  if(file.exists("modelout_xdf.xdf") ) {  file.remove("modelout_xdf.xdf") }
  modelout_xdf = RxXdfData("modelout_xdf.xdf") #initialise xdf object
  rxPredict(ML_model, data = scoredata, outData = modelout_xdf, overwrite = TRUE,
            writeModelVars = TRUE, reportProgress = 0)

## predictwinequality_MRSpart4.R
# Predictors are every column of data_classi other than the label column
# and the train/test split marker.
allfeatures <- setdiff(names(data_classi), c("LabelsCol", "splitcol"))

# Build the model formula "LabelsCol ~ <feature 1> + <feature 2> + ..."
formula <- as.formula(
  paste0("LabelsCol", " ~ ", paste(allfeatures, collapse = " + "))
)
formula

## predictwinequality_MRSpart3.R
# Add a factor column 'splitcol' that tags each row at random as 'train'
# (rbinom draw of 1, probability 0.8) or 'test' (draw of 0); data_classi is
# rewritten in place.
# The levels are pinned to c(0, 1) explicitly: without them, factor() derives
# the levels from the values it is handed, and a draw containing only 0s or
# only 1s would fail because two labels are supplied for a single level.
rxDataStep(
  inData = data_classi,
  outFile = data_classi,
  transforms = list(
    splitcol = factor(rbinom(.rxNumRows, 1, 0.8),
                      levels = c(0, 1),
                      labels = c("test", "train"))
  ),
  overwrite = TRUE
)

# Split on 'splitcol': rxSplit() splits an input '.xdf' file or data frame into
# multiple '.xdf' files or a list of data frames.
# The suffixes are listed in the same order as the factor levels
# ('test', 'train') so that each output file name matches its contents.
# NOTE(review): the RevoScaleR argument is presumably spelled 'outFilesBase';
# 'outFileBase' is kept as written here - confirm against the rxSplit docs.
listofxdfs <- rxSplit(
  data_classi,
  outFileBase = "data_classi_split",
  outFileSuffixes = c("Test", "Train"),
  splitByFactor = "splitcol",
  overwrite = TRUE
)

# Outputs are taken in factor-level order: [[1]] is 'test', [[2]] is 'train'
# (assumes rxSplit returns one element per level, in level order).
trainingdata <- listofxdfs[[2]]
testdata <- listofxdfs[[1]]

## predictwinequality_MRSpart2.R
# Turn the label into a categorical variable: create the factor column
# 'factorQuality' from 'quality', writing the result back onto data_classi.
rxFactors(
  inData = data_classi,
  outFile = data_classi,
  overwrite = TRUE,
  factorInfo = list(factorQuality = list(varName = "quality")),
  reportProgress = 0
)

# The original 'quality' column is no longer needed; keep everything else.
ColsToKeep <- setdiff(names(data_classi), "quality")

data_classi <- rxDataStep(
  inData = data_classi,
  outFile = "data_classi_temp.xdf",
  varsToKeep = ColsToKeep,
  overwrite = TRUE
)

#rename the label col 'factorQuality' as 'LabelsCol'

## predictwinequality_MRSpart1.R
## Read the red-wine quality data straight from the UCI repository
# https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
# The file is semicolon-separated with a header row; "NA" marks missing values.
data_wine <- read.table(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
  header = TRUE,
  sep = ";",
  na.strings = "NA"
) # expected dimensions: 1599 12
class(data_wine) # "data.frame"

#Relabel quality ratings as follows
#3,4,5 as Low
#6     as Med
#7,8   as High

## MRScode_baselinemetrics_censusincomedata.r
# Purpose: Evaluate your classification model against some baseline metrics
# Establish that your model is not just making lucky guesses but is significantly better than a random model
# Author : Shaheen Gauher - Data Scientist at Microsoft

# Note: The code below requires MRS (Microsoft R Server, formerly Revolution R Enterprise (RRE))
# http://blog.revolutionanalytics.com/2016/01/microsoft-r-open.html
# MRS can be downloaded from https://www.dreamspark.com/Product/Product.aspx?productid=105

#download data from
#https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
	import pandas as pd
	from pyspark.sql import functions as F
	from pyspark.sql.functions import isnan, when, count, col

	def dataprofile(data_all_df,data_cols):
	data_df = data_all_df.select(data_cols)
	columns2Bprofiled = data_df.columns
	global schema_name, table_name
	if not 'schema_name' in globals():
	schema_name = 'schema_name'
	# Purpose: Build classification models to predict wine quality
	# Use three different classification algorithms and compare their accuracies
	# Author : Shaheen Gauher - Data Scientist at Microsoft

	# Note: The code below requires MRS (Microsoft R Server, formerly Revolution R Enterprise (RRE))
	# http://blog.revolutionanalytics.com/2016/01/microsoft-r-open.html
	# MRS can be downloaded from https://www.dreamspark.com/Product/Product.aspx?productid=105

	##download data from
	#https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
	################################################################################
	## Decision forest modeling
	################################################################################
	#Decision Forest
	#using rxDForest() to build ML model
	DForest_model <- rxDForest(formula = formula,
	data = trainingdata,
	seed = 10,
	cp = 0.01,
	nTree = 50,
	#=======================================================
	# Compute the accuracy of the trained models and how it performs on the test data
	#=======================================================

	#Function to compute accuracy of the trained model on the given data
	computeaccuracy <- function(ML_model,scoredata){
	if(file.exists("modelout_xdf.xdf") ) { file.remove("modelout_xdf.xdf") }
	modelout_xdf = RxXdfData("modelout_xdf.xdf") #initialise xdf object
	rxPredict(ML_model, data = scoredata, outData = modelout_xdf, overwrite = TRUE,
	writeModelVars = TRUE, reportProgress = 0)
	# Predictors are every column of data_classi other than the label column
	# and the train/test split marker.
	allfeatures <- setdiff(names(data_classi), c("LabelsCol", "splitcol"))

	# Build the model formula "LabelsCol ~ <feature 1> + <feature 2> + ..."
	formula <- as.formula(
	  paste0("LabelsCol", " ~ ", paste(allfeatures, collapse = " + "))
	)
	formula
	# Add a factor column 'splitcol' that tags each row at random as 'train'
	# (rbinom draw of 1, probability 0.8) or 'test' (draw of 0); data_classi is
	# rewritten in place.
	# The levels are pinned to c(0, 1) explicitly: without them, factor() derives
	# the levels from the values it is handed, and a draw containing only 0s or
	# only 1s would fail because two labels are supplied for a single level.
	rxDataStep(
	  inData = data_classi,
	  outFile = data_classi,
	  transforms = list(
	    splitcol = factor(rbinom(.rxNumRows, 1, 0.8),
	                      levels = c(0, 1),
	                      labels = c("test", "train"))
	  ),
	  overwrite = TRUE
	)

	# Split on 'splitcol': rxSplit() splits an input '.xdf' file or data frame into
	# multiple '.xdf' files or a list of data frames.
	# The suffixes are listed in the same order as the factor levels
	# ('test', 'train') so that each output file name matches its contents.
	# NOTE(review): the RevoScaleR argument is presumably spelled 'outFilesBase';
	# 'outFileBase' is kept as written here - confirm against the rxSplit docs.
	listofxdfs <- rxSplit(
	  data_classi,
	  outFileBase = "data_classi_split",
	  outFileSuffixes = c("Test", "Train"),
	  splitByFactor = "splitcol",
	  overwrite = TRUE
	)

	# Outputs are taken in factor-level order: [[1]] is 'test', [[2]] is 'train'
	# (assumes rxSplit returns one element per level, in level order).
	trainingdata <- listofxdfs[[2]]
	testdata <- listofxdfs[[1]]
	# Turn the label into a categorical variable: create the factor column
	# 'factorQuality' from 'quality', writing the result back onto data_classi.
	rxFactors(
	  inData = data_classi,
	  outFile = data_classi,
	  overwrite = TRUE,
	  factorInfo = list(factorQuality = list(varName = "quality")),
	  reportProgress = 0
	)

	# The original 'quality' column is no longer needed; keep everything else.
	ColsToKeep <- setdiff(names(data_classi), "quality")

	data_classi <- rxDataStep(
	  inData = data_classi,
	  outFile = "data_classi_temp.xdf",
	  varsToKeep = ColsToKeep,
	  overwrite = TRUE
	)

	#rename the label col 'factorQuality' as 'LabelsCol'
	## Read the red-wine quality data straight from the UCI repository
	# https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
	# The file is semicolon-separated with a header row; "NA" marks missing values.
	data_wine <- read.table(
	  "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
	  header = TRUE,
	  sep = ";",
	  na.strings = "NA"
	) # expected dimensions: 1599 12
	class(data_wine) # "data.frame"

	#Relabel quality ratings as follows
	#3,4,5 as Low
	#6 as Med
	#7,8 as High
	# Purpose: Evaluate your classification model against some baseline metrics
	# Establish that your model is not just making lucky guesses but is significantly better than a random model
	# Author : Shaheen Gauher - Data Scientist at Microsoft

	# Note: The code below requires MRS (Microsoft R Server, formerly Revolution R Enterprise (RRE))
	# http://blog.revolutionanalytics.com/2016/01/microsoft-r-open.html
	# MRS can be downloaded from https://www.dreamspark.com/Product/Product.aspx?productid=105

	#download data from
	#https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data