Predict whether a tip was paid for a taxi trip using several MicrosoftML learners, and compare them to find the best fit.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---- Load the learners and import the taxi sample ----
library(MicrosoftML)

# ODBC connection string for the "nyctaxi" database.
# NOTE(review): "Server=." with Trusted_Connection=True presumably targets a
# local SQL Server instance using Windows authentication -- confirm for your
# environment before running.
sqlConnString <-
  "Driver=SQL Server;Server=.;Database=nyctaxi;Trusted_Connection=True"

# Data source describing the "nyctaxi_sample" table.
dataSetSource <- RxSqlServerData(
  connectionString = sqlConnString,
  table = "nyctaxi_sample",
  rowsPerRead = 2000000
)

# Import the table described by the data source.
dataset <- rxImport(inData = dataSetSource)

# Inspect what was imported: variable metadata first, then the leading rows.
rxGetVarInfo(data = dataset)
head(dataset)
# ---- Split the data into train and test sets ----

# Seed the random number generator (kind "L'Ecuyer-CMRG") so that the random
# row assignment below is reproducible.
set.seed(2345, "L'Ecuyer-CMRG")

# Target proportions: 75% of the rows for training, 25% for testing.
dataProb <- c(Train = 0.75, Test = 0.25)

# Labels ("Train", "Test") used as the levels of the splitting factor.
splitLabels <- names(dataProb)

# Each row gets a "splitVar" label sampled with the probabilities in dataProb;
# rxSplit() then splits the data on that factor. `dataFactor` and `dataProb`
# are handed to the transform through `transformObjects`, and `.rxNumRows` is
# supplied by the transform machinery (row count of the data being
# transformed).
dataSplit <- rxSplit(
  dataset,
  splitByFactor = "splitVar",
  transforms = list(
    splitVar = sample(
      dataFactor,
      size = .rxNumRows,
      replace = TRUE,
      prob = dataProb
    )
  ),
  transformObjects = list(
    dataProb = dataProb,
    dataFactor = factor(splitLabels, levels = splitLabels)
  ),
  outFilesBase = tempfile()
)

# The first element is used as the training set and the second as the test
# set (assumes rxSplit() returns the pieces in factor-level order: Train, Test).
dataTrain <- dataSplit[[1]]
dataTest <- dataSplit[[2]]

# Summary statistics of the label in each split.
rxSummary(formula = ~ tipped, data = dataTrain)$sDataFrame
rxSummary(formula = ~ tipped, data = dataTest)$sDataFrame
# ---- Fit the candidate models ----

# Model specification shared by every learner: predict `tipped` from the
# passenger count, trip duration, trip distance and total amount.
model <- tipped ~
  passenger_count + trip_time_in_secs + trip_distance + total_amount

# Train one model per MicrosoftML learner on the training split.
rxLogisticRegressionFit <- rxLogisticRegression(formula = model, data = dataTrain)
rxFastLinearFit <- rxFastLinear(formula = model, data = dataTrain)
rxFastTreesFit <- rxFastTrees(formula = model, data = dataTrain)
rxFastForestFit <- rxFastForest(formula = model, data = dataTrain)
rxNeuralNetFit <- rxNeuralNet(formula = model, data = dataTrain)
# ---- Score the test set with every fitted model ----

# Fitted models keyed by the suffix that is appended to their output columns.
fitsToScore <- list(
  rxLogisticRegression = rxLogisticRegressionFit,
  rxFastLinear = rxFastLinearFit,
  rxFastTrees = rxFastTreesFit,
  rxFastForest = rxFastForestFit,
  rxNeuralNet = rxNeuralNetFit
)

# Start from the test split and chain the predictions: each pass scores the
# output of the previous pass and carries all of its existing columns along
# (extraVarsToWrite), so the final data holds every model's scores side by
# side. Each pass writes to a fresh temporary .xdf file.
fitScores <- dataTest
for (fitName in names(fitsToScore)) {
  fitScores <- rxPredict(
    fitsToScore[[fitName]],
    fitScores,
    suffix = paste0(".", fitName),
    extraVarsToWrite = names(fitScores),
    outData = tempfile(fileext = ".xdf")
  )
}
# ---- ROC curves ----

# Pick the predicted-probability columns written by rxPredict(). The pattern
# is anchored and the dot is escaped so that only names beginning with the
# literal prefix "Probability." are selected (an unescaped "." would match
# any character, anywhere in the name).
probabilityVars <- grep("^Probability\\.", names(fitScores), value = TRUE)

# Compute the fitted models' ROC curves against the actual `tipped` label.
fitRoc <- rxRoc(
  actualVarName = "tipped",
  predVarNames = probabilityVars,
  data = fitScores
)

# Plot the ROC curves and report their AUCs.
plot(fitRoc)
# ---- Compare AUCs and pick the best model ----

# Fitted models, keyed by learner name so the winner can be looked up by name.
fitList <- list(
  rxLogisticRegression = rxLogisticRegressionFit,
  rxFastLinear = rxFastLinearFit,
  rxFastTrees = rxFastTreesFit,
  rxFastForest = rxFastForestFit,
  rxNeuralNet = rxNeuralNetFit
)

# Compute the fitted models' AUCs from the ROC data.
fitAuc <- rxAuc(fitRoc)

# Drop the leading "Probability." from each name so that the AUC names line
# up with the keys of fitList.
prefixLength <- nchar("Probability.")
names(fitAuc) <- substring(names(fitAuc), prefixLength + 1)

# Name of the fit with the largest AUC, and the corresponding model object.
bestFitName <- names(fitAuc)[which.max(fitAuc)]
bestFit <- fitList[[bestFitName]]

# Report all AUCs.
cat("Fit model AUCs:\n")
print(fitAuc, digits = 2)

# Report the winner, with its AUC rounded to two significant digits.
bestAuc <- signif(fitAuc[[bestFitName]], digits = 2)
cat(paste0("Best fit model with ", bestFitName,
           ", AUC = ", bestAuc,
           ".\n"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I am trying to learn this better. Where can I find more sample scripts or documentation?