Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Predict whether or not a tip was paid for a taxi trip using different MicrosoftML functions and compare them to find the best fit
library(MicrosoftML)
# ODBC connection string for the nyctaxi database (Server=., trusted
# connection).
sqlConnString <- "Driver=SQL Server;Server=.;Database=nyctaxi;Trusted_Connection=True"

# Describe the nyctaxi_sample table as a data source, with rowsPerRead set to
# 2,000,000.
dataSetSource <- RxSqlServerData(
  connectionString = sqlConnString,
  table = "nyctaxi_sample",
  rowsPerRead = 2000000
)

# Import the data source, then inspect its variable metadata and first rows.
dataset <- rxImport(inData = dataSetSource)
rxGetVarInfo(dataset)
head(dataset)
# Fix the seed and RNG kind so the random split below is reproducible.
set.seed(2345, "L'Ecuyer-CMRG")

# Target share of rows for the train and test sets (75% / 25%).
dataProb <- c(Train = 0.75, Test = 0.25)

# Split the data on a new factor column, splitVar, whose values are sampled
# with replacement from the names of dataProb using the probabilities above.
# dataProb and dataFactor are handed over through transformObjects so that the
# transforms expression can reference them. The sample size is .rxNumRows,
# which is not defined in this script (presumably supplied by RevoScaleR when
# the transform runs -- confirm against the rxSplit documentation).
# outFilesBase is set to a temporary file name.
dataSplit <- rxSplit(
  dataset,
  splitByFactor = "splitVar",
  transforms = list(
    splitVar = sample(
      dataFactor,
      size = .rxNumRows,
      replace = TRUE,
      prob = dataProb
    )
  ),
  transformObjects = list(
    dataProb = dataProb,
    dataFactor = factor(names(dataProb), levels = names(dataProb))
  ),
  outFilesBase = tempfile()
)

# The first piece is used as the training set, the second as the test set.
dataTrain <- dataSplit[[1]]
dataTest <- dataSplit[[2]]
# Summarize the label column, tipped, in each partition.
rxSummary(~ tipped, dataTrain)$sDataFrame
rxSummary(~ tipped, dataTest)$sDataFrame

# Formula shared by every learner: tipped is the label, the four trip
# attributes are the features.
model <- formula(
  "tipped ~ passenger_count + trip_time_in_secs + trip_distance + total_amount"
)

# Fit each MicrosoftML learner on the training set with the same formula.
rxLogisticRegressionFit <- rxLogisticRegression(formula = model, data = dataTrain)
rxFastLinearFit <- rxFastLinear(formula = model, data = dataTrain)
rxFastTreesFit <- rxFastTrees(formula = model, data = dataTrain)
rxFastForestFit <- rxFastForest(formula = model, data = dataTrain)
rxNeuralNetFit <- rxNeuralNet(formula = model, data = dataTrain)
# Score the test set with every fitted model, one model per pass. Each pass
# reads the previous pass's output (starting from dataTest), asks rxPredict to
# carry all of that input's columns forward via extraVarsToWrite, tags the new
# prediction columns with the model's suffix, and writes to a fresh temporary
# .xdf file. After the last pass fitScores refers to the output of the final
# rxPredict call.
scoringSteps <- list(
  list(fit = rxLogisticRegressionFit, suffix = ".rxLogisticRegression"),
  list(fit = rxFastLinearFit, suffix = ".rxFastLinear"),
  list(fit = rxFastTreesFit, suffix = ".rxFastTrees"),
  list(fit = rxFastForestFit, suffix = ".rxFastForest"),
  list(fit = rxNeuralNetFit, suffix = ".rxNeuralNet")
)

fitScores <- dataTest
for (scoringStep in scoringSteps) {
  fitScores <- rxPredict(
    scoringStep[["fit"]],
    fitScores,
    suffix = scoringStep[["suffix"]],
    extraVarsToWrite = names(fitScores),
    outData = tempfile(fileext = ".xdf")
  )
}
# Compute the fitted models' ROC curves against the tipped label, using every
# probability column in the scored data. The pattern is anchored at the start
# and the dot is escaped so that only names beginning with the literal prefix
# "Probability." are selected; an unanchored "Probability." would also match
# any column that merely contains "Probability" followed by any character.
probabilityVars <- grep("^Probability\\.", names(fitScores), value = TRUE)
fitRoc <- rxRoc("tipped", probabilityVars, fitScores)

# Plot the ROC curves.
plot(fitRoc)
# Compute the AUC of each model's ROC curve, then strip the leading
# "Probability." from the names so they match the learner names used as keys
# in fitList below.
fitAuc <- rxAuc(fitRoc)
probabilityPrefix <- "Probability."
names(fitAuc) <- substring(names(fitAuc), nchar(probabilityPrefix) + 1)

# Fitted models keyed by learner name, so the winner can be looked up by name.
fitList <- list(
  rxLogisticRegression = rxLogisticRegressionFit,
  rxFastLinear = rxFastLinearFit,
  rxFastTrees = rxFastTreesFit,
  rxFastForest = rxFastForestFit,
  rxNeuralNet = rxNeuralNetFit
)

# Pick the learner with the largest AUC and fetch its fitted model.
bestFitName <- names(which.max(fitAuc))
bestFit <- fitList[[bestFitName]]

# Report all AUCs to two significant digits.
cat("Fit model AUCs:\n")
print(fitAuc, digits = 2)

# Report the winning learner and its AUC.
bestFitAuc <- signif(fitAuc[[bestFitName]], digits = 2)
cat(paste0("Best fit model with ", bestFitName, ", AUC = ", bestFitAuc, ".\n"))
@saraswatmks

This comment has been minimized.

Copy link

saraswatmks commented Jan 26, 2017

I am trying to understand this better. Where can I find more sample scripts or documentation?

@ramnov

This comment has been minimized.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.