# ramnov/MicrosoftML_NYC_Taxi_Tip_Prediction.R

## MicrosoftML_NYC_Taxi_Tip_Prediction.R
library(MicrosoftML)
# ODBC connection string for the nyctaxi database on the local SQL Server
# instance (Server=.), using a trusted connection.
sqlConnString <- "Driver=SQL Server;Server=.;Database=nyctaxi;Trusted_Connection=True"

# Describe the nyctaxi_sample table as a SQL Server data source
# (rowsPerRead = 2000000).
dataSetSource <- RxSqlServerData(
    table = "nyctaxi_sample",
    connectionString = sqlConnString,
    rowsPerRead = 2000000
)

# Import the table contents into `dataset`.
dataset <- rxImport(inData = dataSetSource)

# Print the variable information and the first rows as a quick sanity check.
rxGetVarInfo(data = dataset)
head(dataset)
# Seed the L'Ecuyer-CMRG generator so that the random split is repeatable.
set.seed(seed = 2345, kind = "L'Ecuyer-CMRG")

# Sampling weights for the random partition: 75% Train, 25% Test.
dataProb <- c(Train = 0.75, Test = 0.25)

# Label every row with a randomly drawn Train/Test factor value (splitVar)
# and split the data on that factor.  `dataProb` and `dataFactor` are made
# visible to the transform through transformObjects; `.rxNumRows` is supplied
# by RevoScaleR while the transform runs.  Outputs are written under a
# temporary file base name.
dataSplit <- rxSplit(
    inData = dataset,
    outFilesBase = tempfile(),
    splitByFactor = "splitVar",
    transforms = list(
        splitVar = sample(dataFactor,
                          size = .rxNumRows,
                          replace = TRUE,
                          prob = dataProb)
    ),
    transformObjects = list(
        dataProb = dataProb,
        dataFactor = factor(names(dataProb), levels = names(dataProb))
    )
)

# The first split is used for training and the second for testing
# (presumably the factor level order, Train then Test -- confirm).
dataTrain <- dataSplit[[1]]
dataTest <- dataSplit[[2]]

# Print the summary statistics of the `tipped` label in each partition.
rxSummary(formula = ~ tipped, data = dataTrain)$sDataFrame
rxSummary(formula = ~ tipped, data = dataTest)$sDataFrame
# Model specification shared by every learner: predict `tipped` from the
# passenger count, trip duration, trip distance and total amount.
model <- formula("tipped ~ passenger_count + trip_time_in_secs + trip_distance + total_amount")

# Fit each MicrosoftML learner on the training partition.
rxLogisticRegressionFit <- rxLogisticRegression(formula = model, data = dataTrain)
rxFastLinearFit <- rxFastLinear(formula = model, data = dataTrain)
rxFastTreesFit <- rxFastTrees(formula = model, data = dataTrain)
rxFastForestFit <- rxFastForest(formula = model, data = dataTrain)
rxNeuralNetFit <- rxNeuralNet(formula = model, data = dataTrain)
# Fitted models keyed by the suffix appended to their prediction columns, in
# the order in which they are scored.
scoringFits <- list(
    ".rxLogisticRegression" = rxLogisticRegressionFit,
    ".rxFastLinear" = rxFastLinearFit,
    ".rxFastTrees" = rxFastTreesFit,
    ".rxFastForest" = rxFastForestFit,
    ".rxNeuralNet" = rxNeuralNetFit
)

# Score the test set with each model in turn.  Every pass reads the previous
# result, carries all of its columns forward (extraVarsToWrite) and writes the
# new predictions, tagged with the model's suffix, to a fresh temporary .xdf
# file, so `fitScores` ends up holding the predictions of all models.
fitScores <- dataTest
for (scoreSuffix in names(scoringFits)) {
    fitScores <- rxPredict(
        modelObject = scoringFits[[scoreSuffix]],
        data = fitScores,
        suffix = scoreSuffix,
        extraVarsToWrite = names(fitScores),
        outData = tempfile(fileext = ".xdf")
    )
}

# Compute the fitted models' ROC curves from the scored test set, using
# `tipped` as the actual label and every "Probability.<suffix>" column as a
# predictor.  The pattern is anchored and the dot escaped so that only names
# that literally start with "Probability." are selected (an unescaped "."
# matches any character), and TRUE is spelled out because T can be reassigned.
fitRoc <- rxRoc(
    actualVarName = "tipped",
    predVarNames = grep("^Probability\\.", names(fitScores), value = TRUE),
    data = fitScores
)

# Plot the ROC curves and report their AUCs.
plot(fitRoc)

# Area under each model's ROC curve.  The leading "Probability." is removed
# from the names so that they match the keys of fitList.
fitAuc <- rxAuc(fitRoc)
names(fitAuc) <- substring(names(fitAuc), first = nchar("Probability.") + 1)

# Fitted models keyed by learner name.
fitList <- list(
    rxLogisticRegression = rxLogisticRegressionFit,
    rxFastLinear = rxFastLinearFit,
    rxFastTrees = rxFastTreesFit,
    rxFastForest = rxFastForestFit,
    rxNeuralNet = rxNeuralNetFit
)

# Name of the learner with the largest AUC, and its fitted model.
bestFitName <- names(which.max(fitAuc))
bestFit <- fitList[[bestFitName]]

# Print every model's AUC.
cat("Fit model AUCs:\n")
print(fitAuc, digits = 2)

# Print the winning learner together with its AUC (two significant digits).
cat(paste0(
    "Best fit model with ", bestFitName,
    ", AUC = ", signif(fitAuc[[bestFitName]], digits = 2),
    ".\n"
))
	library(MicrosoftML)
	# ODBC connection string for the nyctaxi database on the local SQL Server
	# instance (Server=.), using a trusted connection.
	sqlConnString <- "Driver=SQL Server;Server=.;Database=nyctaxi;Trusted_Connection=True"

	# Describe the nyctaxi_sample table as a SQL Server data source
	# (rowsPerRead = 2000000).
	dataSetSource <- RxSqlServerData(
	    table = "nyctaxi_sample",
	    connectionString = sqlConnString,
	    rowsPerRead = 2000000
	)

	# Import the table contents into `dataset`.
	dataset <- rxImport(inData = dataSetSource)

	# Print the variable information and the first rows as a quick sanity check.
	rxGetVarInfo(data = dataset)
	head(dataset)
	# Seed the L'Ecuyer-CMRG generator so that the random split is repeatable.
	set.seed(seed = 2345, kind = "L'Ecuyer-CMRG")

	# Sampling weights for the random partition: 75% Train, 25% Test.
	dataProb <- c(Train = 0.75, Test = 0.25)

	# Label every row with a randomly drawn Train/Test factor value (splitVar)
	# and split the data on that factor.  `dataProb` and `dataFactor` are made
	# visible to the transform through transformObjects; `.rxNumRows` is supplied
	# by RevoScaleR while the transform runs.  Outputs are written under a
	# temporary file base name.
	dataSplit <- rxSplit(
	    inData = dataset,
	    outFilesBase = tempfile(),
	    splitByFactor = "splitVar",
	    transforms = list(
	        splitVar = sample(dataFactor,
	                          size = .rxNumRows,
	                          replace = TRUE,
	                          prob = dataProb)
	    ),
	    transformObjects = list(
	        dataProb = dataProb,
	        dataFactor = factor(names(dataProb), levels = names(dataProb))
	    )
	)

	# The first split is used for training and the second for testing
	# (presumably the factor level order, Train then Test -- confirm).
	dataTrain <- dataSplit[[1]]
	dataTest <- dataSplit[[2]]

	# Print the summary statistics of the `tipped` label in each partition.
	rxSummary(formula = ~ tipped, data = dataTrain)$sDataFrame
	rxSummary(formula = ~ tipped, data = dataTest)$sDataFrame
	# Model specification shared by every learner: predict `tipped` from the
	# passenger count, trip duration, trip distance and total amount.
	model <- formula("tipped ~ passenger_count + trip_time_in_secs + trip_distance + total_amount")

	# Fit each MicrosoftML learner on the training partition.
	rxLogisticRegressionFit <- rxLogisticRegression(formula = model, data = dataTrain)
	rxFastLinearFit <- rxFastLinear(formula = model, data = dataTrain)
	rxFastTreesFit <- rxFastTrees(formula = model, data = dataTrain)
	rxFastForestFit <- rxFastForest(formula = model, data = dataTrain)
	rxNeuralNetFit <- rxNeuralNet(formula = model, data = dataTrain)
	# Fitted models keyed by the suffix appended to their prediction columns, in
	# the order in which they are scored.
	scoringFits <- list(
	    ".rxLogisticRegression" = rxLogisticRegressionFit,
	    ".rxFastLinear" = rxFastLinearFit,
	    ".rxFastTrees" = rxFastTreesFit,
	    ".rxFastForest" = rxFastForestFit,
	    ".rxNeuralNet" = rxNeuralNetFit
	)

	# Score the test set with each model in turn.  Every pass reads the previous
	# result, carries all of its columns forward (extraVarsToWrite) and writes the
	# new predictions, tagged with the model's suffix, to a fresh temporary .xdf
	# file, so `fitScores` ends up holding the predictions of all models.
	fitScores <- dataTest
	for (scoreSuffix in names(scoringFits)) {
	    fitScores <- rxPredict(
	        modelObject = scoringFits[[scoreSuffix]],
	        data = fitScores,
	        suffix = scoreSuffix,
	        extraVarsToWrite = names(fitScores),
	        outData = tempfile(fileext = ".xdf")
	    )
	}

	# Compute the fitted models' ROC curves from the scored test set, using
	# `tipped` as the actual label and every "Probability.<suffix>" column as a
	# predictor.  The pattern is anchored and the dot escaped so that only names
	# that literally start with "Probability." are selected (an unescaped "."
	# matches any character), and TRUE is spelled out because T can be reassigned.
	fitRoc <- rxRoc(
	    actualVarName = "tipped",
	    predVarNames = grep("^Probability\\.", names(fitScores), value = TRUE),
	    data = fitScores
	)

	# Plot the ROC curves and report their AUCs.
	plot(fitRoc)

	# Area under each model's ROC curve.  The leading "Probability." is removed
	# from the names so that they match the keys of fitList.
	fitAuc <- rxAuc(fitRoc)
	names(fitAuc) <- substring(names(fitAuc), first = nchar("Probability.") + 1)

	# Fitted models keyed by learner name.
	fitList <- list(
	    rxLogisticRegression = rxLogisticRegressionFit,
	    rxFastLinear = rxFastLinearFit,
	    rxFastTrees = rxFastTreesFit,
	    rxFastForest = rxFastForestFit,
	    rxNeuralNet = rxNeuralNetFit
	)

	# Name of the learner with the largest AUC, and its fitted model.
	bestFitName <- names(which.max(fitAuc))
	bestFit <- fitList[[bestFitName]]

	# Print every model's AUC.
	cat("Fit model AUCs:\n")
	print(fitAuc, digits = 2)

	# Print the winning learner together with its AUC (two significant digits).
	cat(paste0(
	    "Best fit model with ", bestFitName,
	    ", AUC = ", signif(fitAuc[[bestFitName]], digits = 2),
	    ".\n"
	))