###############################################################################
## SDP Fall Workshop Predictive Analytics
## Advanced / Additional Code Snippets for Working with PA Data and Models
## Author: Jared E. Knowles
## Date: 09/14/2018
## You do not need to use all or even any of this code. The code does not need to
## be run together. This is just a survey of some additional techniques/tricks you
## can do in R to make explaining predictive models and complex data easier.
## As always - your needs and approaches may differ.
################################################################################
# Load the data
load("data/montucky.rda")
########################################################################
# Build and plot a regression tree
########################################################################
library(rpart.plot)
library(rpart)
# Basic partition tree model to plot
binary.model <- rpart(ontime_grad ~ scale_score_7_math +
                        pct_days_absent_7 + ell_7 + iep_7 + frpl_7 + male +
                        race_ethnicity +
                        sch_g7_lep_per + sch_g7_gifted_per,
                      data = sea_data,
                      control = rpart.control(cp = .005))
# Pass the model object to the plot function
# You can look at ?rpart.plot to identify customizations you wish to use
rpart.plot(binary.model)
# To improve the labels, you can rename the variables in the data
sea_data$short_race_label <- NA
sea_data$short_race_label[sea_data$race_ethnicity == "Hispan..."] <- "H"
sea_data$short_race_label[sea_data$race_ethnicity == "White"] <- "W"
sea_data$short_race_label[sea_data$race_ethnicity == "Asian"] <- "A"
sea_data$short_race_label[sea_data$race_ethnicity == "Americ..."] <- "AI"
sea_data$short_race_label[sea_data$race_ethnicity == "Demogg..."] <- "M"
sea_data$short_race_label[sea_data$race_ethnicity == "Native..."] <- "HA"
sea_data$short_race_label[sea_data$race_ethnicity == "Black ..."] <- "B"
# Fit a classification model, just like a logistic regression formula interface
binary.model <- rpart(ontime_grad ~ scale_score_7_math +
                        pct_days_absent_7 + ell_7 + iep_7 + frpl_7 + male +
                        short_race_label +
                        sch_g7_lep_per + sch_g7_gifted_per,
                      data = sea_data,
                      control = rpart.control(cp = .005))
# Make a plot showing the tree
rpart.plot(binary.model)
########################################################################
## Variable importance from a caret/train model object
########################################################################
library(caret)
## Fit a simple model just to show the syntax, you should be fitting a model
## split on train and test data, substitute your own model here.
# Unfortunately, we have to remove NAs first, because
# caret won't do it for us
train_data <- na.omit(sea_data[, c("ontime_grad", "scale_score_7_math",
                                   "pct_days_absent_7", "ell_7", "iep_7",
                                   "male", "frpl_7", "race_ethnicity",
                                   "sch_g7_lep_per", "sch_g7_gifted_per")])
# Outcome must be a factor in R to work with caret, so we recode
# Avoid 0/1 factor labels - this causes problems in fitting many model types.
train_data$ontime_grad <- ifelse(train_data$ontime_grad == 1, "Grad", "Nongrad")
train_data$ontime_grad <- factor(train_data$ontime_grad)
# Fit the model
caret_mod <- train(ontime_grad ~ .,
                   method = "rpart",
                   data = train_data,
                   metric = "AUC",
                   trControl = trainControl(summaryFunction = prSummary,
                                            classProbs = TRUE,
                                            method = "cv"))
# Get variable importance, not available for all methods
caret::varImp(caret_mod)
# Plot variable importance
plot(caret::varImp(caret_mod))
########################################################################
## Probability accuracy
## Plot the relationship between predicted probabilities and observed
## graduation rates to explore thresholds.
########################################################################
library(yardstick)
library(dplyr)
# Get the predicted classifications from the caret model above
class_vec <- predict(caret_mod, newdata = train_data)
# Get the predicted probabilities from the model above
## Note that caret insists on giving probabilities for both classes, we
## need to store only one, in this case, the first one
prob_vec <- predict(caret_mod, newdata = train_data, type = "prob")[[1]] # only need first column
# Combine the true values, the estimated class, and the class probability
# into one dataframe for plotting
estimates_tbl <- data.frame(
truth = as.factor(train_data$ontime_grad),
estimate = as.factor(class_vec),
class_prob = prob_vec
)
# Using this structure we can use the `yardstick` package to nicely
# compute a variety of performance metrics
## Confusion matrix
estimates_tbl %>% yardstick::conf_mat(truth, estimate)
# Accuracy
estimates_tbl %>% yardstick::metrics(truth, estimate)
# AUC
estimates_tbl %>% yardstick::roc_auc(truth, class_prob)
# ROC graph
# Plots the ROC curve (the ROC at all possible threshold values for the
# probability cutoff)
library(pROC)
roc(estimates_tbl$truth, estimates_tbl$class_prob) %>% plot
# Save the confusion matrix as an R object
conf_mat <- estimates_tbl %>% yardstick::conf_mat(truth, estimate)
# Plot the confusion matrix if you like
library(vcd)
labs <- round(prop.table(conf_mat$table), 2)
# Can change the margin to change the labels
mosaic(conf_mat$table, pop=FALSE)
labeling_cells(text = labs, margin = 0)(conf_mat$table)
################################################################################
## Probability vs. outcome plot for predicted probabilities
################################################################################
plotdf <- estimates_tbl %>%
mutate(prob_cut = percent_rank(class_prob)) %>%
# depending on how many unique probabilities your model produces you can try
# mutate(prob_cut = ntile(class_prob, 50)) %>%
group_by(prob_cut) %>%
summarize(
avg_prob = mean(class_prob),
prob_grad = sum(truth == "Grad") / length(truth),
count = n())
library(ggplot2)
library(ggalt) # for fun lollipop charts
ggplot(plotdf, aes(x = avg_prob, y = prob_grad)) +
ggalt::geom_lollipop() + theme_classic() +
coord_cartesian(xlim = c(0, 1), ylim = c(0, 1), expand=FALSE) +
geom_smooth(se=FALSE)
################################################################################
## Advanced: Make a Bowers plot
#################################################################################
# Get the data for comparable points
url <- "https://raw.githubusercontent.com/jknowles/DEWSatDoGoodData2016/master/data/BowersEWSReviewData.csv"
ews <- read.csv(url)
# Clean up the coding of the data with better labels
# This is optional, but it highlights some reference
# models
ews$flag <- "Other EWI"
ews$flag[ews$id==1 | ews$id==2] <- "Chicago On-Track"
ews$flag[ews$id > 3 & ews$id <14] <- "Balfanz ABC"
ews$flag[ews$id == 85] <- "Muthén Math GMM"
ews$flag[ews$id==19] <- "Bowers GPA GMM"
ews$flag <- factor(ews$flag)
ews$flag <- relevel(ews$flag, ref="Other EWI")
mycol <- c("Other EWI" = "gray70", "Chicago On-Track" = "blue",
"Balfanz ABC" = "purple",
"Muth?n Math GMM" = "orange",
"Bowers GPA GMM" = "dark red")
# The big block of plotting code, you only need to modify the last
# couple of lines, the rest adds annotations and labels which you
# can switch on and off if you like
ggplot(ews) + aes(x = 1-specificity, y = sensitivity,
shape = flag, size = I(4), color = flag) +
geom_point() +
# Label the shape legend
scale_shape("EWI Type") +
# Label the color legend and customize it
scale_color_manual("EWI Type", values = mycol) +
# Add a reference 45 degree line representing random chance
geom_abline(intercept = 0, slope = 1, linetype = 2) +
# Set the scales of the chart to not distort distances
coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
# Add a simple theme to avoid clutter
theme_bw() +
# Label the axes
labs(x="False Alarm Proportion", y="True Positive Proportion",
title = "ROC Accuracy of Early Warning Indicators") +
# Place the legend in the bottom right
theme(legend.position = c(0.8, 0.2),
legend.background = element_rect(fill = NULL,
color = "black")) +
# Add arrows to annotate the plot
annotate(geom="segment", x = 0.55, y = 0.625,
yend = 0.785, xend = 0.4,
arrow = arrow(length = unit(0.5, "cm"))) +
# Label the arrow
annotate(geom="text", x = .365, y = .81, label="Better Prediction") +
annotate(geom="segment", x = 0.65, y = 0.625, yend = 0.5,
xend = 0.75, arrow = arrow(length = unit(0.5, "cm"))) +
annotate(geom="text", x = .75, y = .48, label="Worse Prediction") +
annotate(geom="text", x = .75, y = .78, angle = 37, label="Random Guess") +
# Add your annotation here:
annotate(geom="point",
x = 1-estimates_tbl %>% yardstick::spec(truth, estimate),
y = estimates_tbl %>% yardstick::sens(truth, estimate),
size = 5, color = "red")
# Use a custom probability threshold
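# A minimal sketch (not part of the original workshop code): classify students
# at a custom cutoff using the class probabilities stored in estimates_tbl.
# The 0.6 value is an arbitrary example threshold - substitute one that fits
# your stakeholders' tolerance for false positives vs. false negatives.
custom_thresh <- 0.6
estimates_tbl$estimate_custom <- factor(
  ifelse(estimates_tbl$class_prob >= custom_thresh, "Grad", "Nongrad"),
  levels = levels(estimates_tbl$truth))
# Compare the confusion matrix at the custom threshold to the default one above
estimates_tbl %>% yardstick::conf_mat(truth, estimate_custom)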
---
title: "Predictive Analytics Tutorial"
author: "OpenSDP"
date: "September 9, 2017"
output:
  word_document:
    reference_docx: reference.docx
    toc: true
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE, error = FALSE,
results = "hide", fig.show="hide")
```
## Introduction
This file provides guidance and R syntax examples for the hands-on
predictive analytics session during the Fall 2018 Cohort 9 Strategic Data
Project Workshop in Denver.
During the workshop, we'll ask you to work with stakeholders to develop a dropout
early warning system for 7th grade students for a simulated state education
agency (the state of Montucky) using student data. This guide will provide you
with a baseline model to get you started and introduce you to the main concepts
of predictive analytics. From there, you will work together with stakeholders to
incorporate their values into the model, secure their support and adoption of
the model, and practice an inclusive and iterative design process for building
analytics.
As a data analyst, your role in this process is to measure the accuracy of the
model and communicate the tradeoffs of different models in terms of accuracy
and usability. Together with stakeholders, you might find that the most accurate
model does not reflect the values of the end users. Perhaps the most accurate
model requires variables not available for all students, or predicts all students
in certain categories to be at-risk. These are issues you will have to work
together with stakeholders to address.
Logistic regression is one tool you can use, and we'll demonstrate it here.
There are many other techniques of increasing complexity. (Many of the best
predictive analytics packages are written in the R programming language.) But
for a binary outcome variable, most data scientists start with logistic
regressions, and those are very straightforward to do in R.
Here are the steps:
1. explore the data, especially the predictors and the graduation outcome
2. examine the relationship between predictors and outcomes
3. evaluate the predictive power of different variables and select predictors for your model
4. make predictions using logistic regression
5. convert the predicted probabilities into a 0/1 indicator
6. look at the effect of different probability cutoffs on prediction accuracy (develop a "confusion matrix")
This guide will take you through these steps using a baseline model for predicting
graduation using student grade 7 data. During the workshop you will iterate on
this baseline model using the process above and find an improved model that
meets stakeholder needs.
The commands in this script won't tell you everything you need to do to develop
your model, but they will give you command
syntax that you should be able to adjust and adapt to get the project done.
You should also consider non-model approaches like the "checklist" approaches
described in the pre-reading created at the Consortium on Chicago School
Research (CCSR). With that "checklist" approach, you experiment with different
thresholds for your predictor variables, and combine them to directly predict
the outcome. This approach has the advantage of being easy to explain and
implement, but it might not yield the most accurate predictions. We won't
demonstrate that approach here, but if you want to try it you can draw on the
syntax examples here to develop it.
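To make the idea concrete, here is a minimal sketch of what a checklist-style
flag could look like in R once the data from the Setup section below is loaded.
The cutoffs (more than 10 percent of days absent, or a bottom-quartile math
score) are placeholders for illustration, not recommended thresholds.
```{r checklist_sketch, eval=FALSE}
# Illustrative checklist indicator: flag a student who crosses either
# placeholder threshold, then compare the flag to the observed outcome
math_cut <- quantile(sea_data$scale_score_7_math, 0.25, na.rm = TRUE)
sea_data$checklist_flag <- as.numeric(
  sea_data$pct_days_absent_7 > 10 |
    sea_data$scale_score_7_math < math_cut
)
table(flag = sea_data$checklist_flag, grad = sea_data$ontime_grad,
      useNA = "always")
```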
Before you get started, you need to think about variables, time, and datasets.
The sooner in a student's academic trajectory you can make a prediction, the
sooner you can intervene--but the less accurate your predictions, and hence your
intervention targeting, is likely to be. What data, and specifically which
variables, do you have available to make predictions? What outcome are you
trying to predict?
In the case of the workshop, we are limiting our focus to using data available
at the end of grade 7 to predict outcomes at the end of high school. A critical
step in developing a predictive model is to identify the time points that
different measures are collected during the process you are predicting -- you
can't use data from the future to make predictions. If you're planning to use
your model to make predictions for students at the end of 11th grade, for
instance, and if most students take AP classes as seniors, you can't use data
about AP coursetaking collected during senior year to predict the likelihood of
college enrollment, even if you have that data available for past groups of
students.
In terms of datasets, you can develop your model and then test its accuracy on
the dataset you used to develop the model, but that is bad practice--in the real
world, your model is only as good as its predictions on different, out of sample
datasets. It's good practice to split your data into three parts: one part for
developing your model, one for repeatedly testing different versions of your
model, and a third to use for a final out of sample test.
We're using multiple cohorts of middle-school students for the predictive analytics
task--students who were seventh graders between 2007 and 2011. These are the
cohorts for which we have access to reliable information on their high school
graduation status (late graduate, on time graduate, dropout, transferout,
disappeared). For this workshop you are being given the entire set of data so
that you can explore different ways of organizing the data across cohorts.
An example is that you may choose to explore your models and data using the
earlier cohort, and evaluate their performance on more recent cohorts.
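For instance, a cohort-based split might look something like the sketch below
(the year values are placeholders; check `table(sea_data$year)` for the cohorts
actually present in the data):
```{r cohort_split_sketch, eval=FALSE}
# Illustrative three-way split by cohort: earliest cohorts for model
# development, a middle cohort for comparing candidate models, and the most
# recent cohorts held out for a final out-of-sample test.
dev_data   <- sea_data[sea_data$year %in% c(2003, 2004), ]
tune_data  <- sea_data[sea_data$year == 2005, ]
final_test <- sea_data[sea_data$year > 2005, ]
```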
One last point -- even though the data is synthetic, we have simulated missing data
for you. In the real world, you'll need to make predictions for every
student, even if you're missing data for that student which your model needs in
order to run. Just making predictions using a logistic regression won't be
enough. You'll need to use decision rules based on good data exploration and
your best judgment to predict and fill in outcomes for students where you have
insufficient data.
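As a simple illustration, a fallback decision rule for students missing a key
predictor might look like the sketch below; the attendance cutoff is a
placeholder, not a recommendation.
```{r missing_rule_sketch, eval=FALSE}
# Illustrative decision rule only: if a student is missing the test score a
# model needs, fall back to an attendance-based flag rather than dropping them.
fallback_flag <- ifelse(
  is.na(sea_data$scale_score_7_math),
  as.numeric(sea_data$pct_days_absent_7 > 10), # fallback rule
  NA # students with complete data would get the model prediction instead
)
table(fallback_flag, useNA = "always")
```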
## Getting Started
If you're using the `Rmd` file version of these materials, start by saving a new
version of the file, so you can edit it without worrying about overwriting the
original. Then work through the file in RStudio by highlighting one or a few
command lines at a time, clicking the "execute" icon (or pressing
control-enter), and then looking at the results in the R console. Edit or add
commands as you wish. If you're using a paper or PDF version of these
materials, just read on--the R output appears below each section of commands.
This guide is built using the data you will be working with at the workshop.
This dataset includes simulated data for multiple cohorts of 7th graders along
with their corresponding high school outcomes. Each observation (row) is a student,
and includes associated information about the student's demographics, academic
performance in 7th grade, associated school and district (in grade 7), last
high school attended, and high school completion.
To work through this script, you need to put the
`montucky.rda` data file on your computer in a working folder of
your choice, and then edit the commands below to
tell R where to look for the data. If you have trouble doing this, ask for
help from other members of your group.
## Setup
To prepare for this project you will need to ensure that your R installation
has the necessary add-on packages and that you can read in the training data.
```{r installPackages, eval=FALSE}
# Install add-on packages needed
install.packages("dplyr") # this will update your installed version to align with
install.packages("pROC") # those in the tutorial
install.packages("devtools")
install.packages("caret") # for machine learning
install.packages("future") # for multicore processing on Windows/Mac/Linux
```
```{r loadWorkspace}
# Load the packages you need
library(dplyr)
library(pROC)
library(devtools)
library(future)
# Load the helper functions not in packages
devtools::source_gist("ed47cd156462a9900df1f77a000f4a52",
filename = "helper_funcs.R")
# Read in the data
# This command assumes that the data is in a folder called data, below your
# current working directory. You can check your working directory with the
# getwd() command, and you can set your working directory using the RStudio
# environment, or the setwd() command.
load("data/montucky.rda")
```
### Validate the data
Ensure that the data imported correctly.
First, ensure that the data is unique by student ID.
```{r}
nrow(sea_data) == n_distinct(sea_data$sid)
```
Wait, what is the issue here?
```{r}
table(sea_data$sid == sea_data$sid[[1]]) # test how many times the first sid appears
```
Why might our IDs be repeated 45 times? Let's look at how many LEAs we have in
our SEA dataset:
```{r}
length(unique(sea_data$sch_g7_lea_id)) # test how many LEAs are in our data
```
We see that our student IDs are not unique by LEA. That's an easy enough
fix.
```{r}
nrow(sea_data) == n_distinct(sea_data$sid, sea_data$sch_g7_lea_id)
```
Let's append the LEA ID onto the student ID to make student IDs truly unique:
```{r}
sea_data$sid <- paste(sea_data$sid, sea_data$sch_g7_lea_id, sep = "-")
```
```{r}
nrow(sea_data) == n_distinct(sea_data$sid)
```
## Explore the Data
A key initial task in building an EWS is to identify the cohort membership of
students. When we build a predictive model, we need to identify two timepoints -
when we will be making the prediction, and when the outcome we are predicting
will be observed. In this case, we will be making the prediction upon receiving
the 7th grade data on students (so the completion of 7th grade), and we will
be predicting their completion of high school.
Let's focus first on identifying the 7th grade year for each student. We have
three year variables; what is their relationship?
```{r}
table(sea_data$year)
table(sea_data$cohort_year)
table(sea_data$cohort_grad_year)
```
From the data dictionary we know that the first year variable is the year
that corresponds with the student entering 7th grade (`year`). The `cohort_year`
variable defines the 9th grade cohort a student belongs to for measuring ontime
graduation. Finally, the `cohort_grad_year` is the year that the cohort a
student is a member of should graduate to be considered "on-time".
If a student graduates, their year of graduation is recorded as `year_of_graduation`.
The definition of graduation types is an important design decision for our
predictive analytic system. We have to set a time by which students are graduated
so we know whether to count them as graduating or not completing. The state of
Montucky uses a 4-year cohort graduation rate for most reporting, defining
ontime graduation as graduation within 4 years of entering high school. This
variable is defined as:
```{r}
table(sea_data$year_of_graduation == sea_data$cohort_grad_year)
table(sea_data$ontime_grad)
```
This is an example of a business rule - a restriction we place on how the data
is defined so that we can use it consistently. In this case, it
is necessary so we can definitively group students for the purposes of predicting
their outcomes. You could consider alternative graduation timelines, for example:
```{r}
table(sea_data$year_of_graduation <= sea_data$cohort_grad_year + 1)
```
What does this rule say? How is it different than the definition of on-time above?
Now that we have a sense of the time structure of the data, let's look at
geography. How many high schools and how many districts are there? What are
those regional education service coops?
We are going to be building a model for an entire state, but stakeholders
may have questions about how the model works for particular schools, districts,
or regions. Let's practice exploring the data by these different geographies.
```{r}
length(unique(sea_data$first_hs_name))
length(unique(sea_data$first_hs_lea_id))
table(sea_data$coop_name_g7, useNA = "always")
```
For this exercise, districts in Montucky are organized into cooperative regions.
Cooperative regions are just groups of LEAs. It may be helpful to compare how
our model performs in a given school, LEA, or coop region to the rest of the
data in the state. As an example of this kind of analysis, select a specific
coop region and explore its data, drawing comparisons to the full dataset. Which
districts are part of this coop region and how many students do they have?
Substitute different abbreviation codes for different coops and then replace the
`my_coop` variable below.
```{r}
my_coop <- sea_data$coop_name_g7[50] # select the coop for the 50th observation
# Which districts are in this coop and how many 7th graders do we have for each?
table(sea_data$sch_g7_lea_id[sea_data$coop_name_g7 == my_coop],
useNA = "always")
# which schools?
table(sea_data$sch_g7_name[sea_data$coop_name_g7 == my_coop],
useNA = "always")
```
What student subgroups are we interested in? Let's start by tabulating a few of them.
Here's whether a student is male.
```{r}
table(sea_data$male, useNA="always")
```
Here's a short for loop to look at one-way tabs of a lot of variables at once.
```{r, eval=FALSE}
for(i in c("male", "race_ethnicity", "frpl_7", "iep_7", "ell_7",
"gifted_7")){
print(i)
print(table(sea_data[, i], useNA="always"))
}
```
Let's examine the distribution of student subgroups by geography. For this
command, we'll use the same looping syntax from above, which lets you avoid
repetition by applying commands to multiple variables at once. You can type
`?for` into the R console if you want to learn more about how to use loops in R.
```{r, eval=FALSE}
for(var in c("male", "race_ethnicity", "frpl_7", "iep_7", "ell_7",
"gifted_7")){
print(var)
print( # have to call print inside a loop
round( # round the result
prop.table( # convert table to percentages
table(sea_data$coop_name_g7, sea_data[, var], # build the table
useNA = "always"),
margin = 1), # calculate percentages by row; change to 2 for column
digits = 3) # round off at 3 digits
*100 ) # put on percentage instead of proportion scale
}
```
Now, let's look at high school outcomes. We won't examine them all, but you should.
Here's the ontime high school graduation outcome variable we looked at above:
```{r}
table(sea_data$ontime_grad, useNA = "always")
```
Wait! What if the data includes students who transferred out of state? That
might bias the graduation rate and make it too low, because those 7th graders
might show up as having dropped out.
```{r}
table(sea_data$transferout, useNA = "always")
table(transfer = sea_data$transferout, grad = sea_data$ontime_grad, useNA = "always")
```
This is another case where we may want to consider a business rule. How should
students who transfer out be treated? We don't know whether they graduated
or not. Should they be excluded from the analysis? Coded as not completing?
The decision is yours, but it is important to consider all the possible high
school outcomes when building a model and how the model will treat them.
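As one illustration, the sketch below applies a possible business rule that
excludes transfer students from the modeling data (assuming `transferout` is
coded 0/1 as tabulated above); coding them as non-completers instead would be
an equally defensible choice.
```{r transfer_rule_sketch, eval=FALSE}
# One possible business rule: drop students who transferred out of state, since
# their graduation status is unobserved, and count how many this excludes
model_data <- sea_data[which(sea_data$transferout == 0), ]
nrow(sea_data) - nrow(model_data)
```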
Let's look at the distribution of another outcome variable, `any_grad`, which
includes late graduation and ontime graduation by both geography and by
subgroup.
```{r crosstab_grad, eval=FALSE}
round(
prop.table(
table(sea_data$coop_name_g7, sea_data$any_grad, useNA="always"),
margin = 1
),
digits = 2)
for(var in c("male", "race_ethnicity", "frpl_7", "iep_7", "ell_7",
"gifted_7")){
print(var)
print(
prop.table(
table(grad = sea_data$any_grad, var = sea_data[, var],
useNA = "always"),
margin = 1)
)
}
```
### Review Existing Indicator
Now that we are oriented to the data, let's turn our attention to the model
predictions provided by the vendor. First, let's check the format:
```{r coll_ready_tab}
summary(sea_data$vendor_ews_score)
```
Instead of classifying students, each student receives a predicted probability
for graduating. We need a way to convert this into a classification measure.
One way to do this is to pick a threshold and declare all values greater than
that threshold as being of the positive (graduation) class, and all values
below as being members of the negative class.
```{r}
set_thresh <- 0.5
conf_count <- table(observed = sea_data$ontime_grad,
pred = sea_data$vendor_ews_score > set_thresh)
conf_count
prop.table(conf_count)
```
This matrix tells us, for the threshold we have selected, how many graduates
we identified successfully, how many non-graduates we identified successfully,
and how many mistakes we made in each direction (this is known as a confusion
matrix and will be discussed at length at the workshop!).
How does this compare with the vendor's marketing material and stated accuracy?
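To make that comparison concrete, you can turn the counts in `conf_count` into
summary rates. Here is a minimal sketch, treating on-time graduation as the
positive class:
```{r vendor_accuracy}
# Overall accuracy: share of students on the diagonal of the confusion matrix
sum(diag(conf_count)) / sum(conf_count)
# Sensitivity (true positive rate): graduates the vendor score identified
conf_count["1", "TRUE"] / sum(conf_count["1", ])
# Specificity (true negative rate): non-graduates correctly identified
conf_count["0", "FALSE"] / sum(conf_count["0", ])
```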
Now, let's turn to identifying the predictors available for building an alternative
model. Let's examine the performance and behavioral variables that you can
use as predictors. These are mostly numerical variables, so you should use the
summary, histogram, and table commands to explore them. Here's some syntax for
examining 7th grade math scores. You can replicate and edit it to examine other
potential predictors and their distributions by different subgroups.
```{r}
summary(sea_data$scale_score_7_math)
hist(sea_data$scale_score_7_math)
```
```{r mean_score_by_demog}
by(sea_data$scale_score_7_math, sea_data$coop_name_g7, FUN = mean,
na.rm=TRUE)
by(sea_data$scale_score_7_math, sea_data$frpl_7, FUN = mean,
na.rm=TRUE)
```
Finally, here's some sample code you can use to look at missingness patterns in
the data. Note we use the `is.na()` function to test whether a value is missing.
```{r missingness_checks}
for(var in c("coop_name_g7", "male", "race_ethnicity")){
print(var)
print(
prop.table(table(sea_data[, var],
"missing_math" = is.na(sea_data$pct_days_absent_7)), 1)
)
}
```
Handling missing values is another case where business rules will come into play.
Did you see any outlier or impossible values while you were exploring the data?
If so, you might want to truncate them or change them to missing. Here's how you
can replace a numeric variable with a missing value if it is larger than a
certain number (in this case, 100 percent).
```{r abs_trunc}
hist(sea_data$pct_days_absent_7)
sea_data$pct_days_absent_7[sea_data$pct_days_absent_7 > 100] <- NA
hist(sea_data$pct_days_absent_7)
```
Trimming the data in this way is another example of a business rule. You
may wish to trim the absences even further in this data. You may also wish to
assign a value other than missing to unusual values - such as the mean or
median value.
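As a sketch of that alternative, the chunk below substitutes the median of the
plausible values for the impossible ones instead of setting them to missing;
you would run it in place of, not after, the `NA` replacement above.
```{r median_impute_sketch, eval=FALSE}
# Replace impossible absence values (> 100 percent) with the median of the
# plausible values rather than with NA
plausible <- sea_data$pct_days_absent_7[sea_data$pct_days_absent_7 <= 100]
sea_data$pct_days_absent_7[sea_data$pct_days_absent_7 > 100] <-
  median(plausible, na.rm = TRUE)
```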
Now that you've explored the data, you can start to examine the relationship
between predictor and outcome variables. Here we'll continue to look at the high
school graduation outcome, and we'll restrict the predictors to just two: 7th
grade math scores and percent of enrolled days absent through 7th grade. For
your model, you can of course use more and different predictor
variables. First, check the correlation between outcome and predictors.
```{r}
cor(sea_data[, c("ontime_grad", "scale_score_7_math", "pct_days_absent_7")],
use = "pairwise.complete.obs")
```
These correlations do not look very promising! But remember, a correlation
may not tell the whole story if other factors are involved, and correlations
between binary variables and continuous variables are noisy indicators.
It would be nice to have a better idea of
the overall relationship between outcomes and predictors.
But you can't make a meaningful scatterplot when the dependent variable, or y value, is
a binary outcome variable (try it!). Let's look at a technique to identify
the relationship between a continuous variable and a binary outcome.
The idea behind this code is to show the mean of the outcome variable for each
value of the predictor, or for categories of the predictor variable if it has
too many values. First, define categories (in this case, round to the nearest
percentage) of the percent absent variable, and then truncate the variable so that
low-frequency values are grouped together.
```{r}
sea_data$pct_absent_cat <- round(sea_data$pct_days_absent_7, digits = 0)
table(sea_data$pct_absent_cat)
sea_data$pct_absent_cat[sea_data$pct_absent_cat >= 30] <- 30
```
Next, define a variable which is the average ontime graduation rate for each
absence category, and then make a scatter plot of average graduation rates by
absence percent.
```{r}
sea_data <- sea_data %>%
group_by(pct_absent_cat) %>% # perform the operation for each value
mutate(abs_ontime_grad = mean(ontime_grad, na.rm = TRUE)) # add a new variable
plot(sea_data$pct_absent_cat, sea_data$abs_ontime_grad)
```
You can do the same thing for 7th grade test scores. First look at the math
test score and notice that some scores appear to be outliers.
```{r}
hist(sea_data$scale_score_7_math)
sea_data$scale_score_7_math[sea_data$scale_score_7_math < 0] <- NA
hist(sea_data$scale_score_7_math)
```
You can do the same plot as above now by modifying the `group_by()`
command.
```{r}
sea_data <- sea_data %>%
mutate(math_7_cut = ntile(scale_score_7_math, n = 100)) %>%
group_by(math_7_cut) %>% # perform the operation for each value
mutate(math_7_ontime_grad = mean(ontime_grad, na.rm=TRUE)) # add a new variable
plot(sea_data$math_7_cut, sea_data$math_7_ontime_grad)
```
This is a neat trick you can use to communicate your model predictions as well.
## Model
Now we're ready to call on the `glm` command to compute a logistic regression
for the relationship between our binary outcome variable and our predictor
variables. When you run a logistic regression, R calculates the parameters of an
equation that fits the relationship between the predictor variables and the
outcome. A regression model typically won't be able to explain all of the
variation in an outcome variable--any variation that is left over is treated as
unexplained noise in the data, or error, even if there are additional variables
not in the model which could explain more of the variation.
Once you've run a logit regression, you can have R generate a variable with new,
predicted outcomes for each observation in your data with the `predict` command.
The predictions are calculated using the model equation and ignore the
unexplained noise in the data. For logit regressions, the predicted outcomes
take the form of a probability ranging between 0 and 1. To start with, let's do a
regression of ontime graduation on seventh grade math scores.
```{r}
math_model <- glm(ontime_grad ~ scale_score_7_math, data = sea_data,
family = "binomial") # family tells R we want to fit a logistic
```
The default summary output for logistic regression in R is not very helpful.
```{r}
summary(math_model)
```
Even before you use the predict command, you can use the logit output to learn
something about the relationship between the predictor and the outcome variable.
The Pseudo $R^{2}$ (read R-squared) is a proxy for the share of variation in the
outcome variable that is explained by the predictor. Statisticians don't like it
when you take the pseudo $R^{2}$ too seriously, but it can be useful in predictive
exercises to quickly get a sense of the explanatory power of variables in a
logit model.
```{r}
logit_rsquared(math_model)
```
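If you prefer not to rely on the sourced helper function, McFadden's pseudo
$R^{2}$ can also be computed directly from the fitted `glm` object. Note that it
may differ slightly from `logit_rsquared()` if that helper uses a different
definition.
```{r mcfadden_r2}
# McFadden's pseudo R-squared: 1 minus the ratio of the model deviance to the
# null (intercept-only) deviance
1 - (math_model$deviance / math_model$null.deviance)
```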
Does adding polynomial terms increase the pseudo $R^{2}$? You can use the formula
interface in R to add functional transformations of predictors without generating
new variables and find out.
```{r}
math_model2 <- glm(ontime_grad ~ scale_score_7_math +
I(scale_score_7_math^2) + I(scale_score_7_math^3),
data = sea_data,
family = "binomial") # family tells R we want to fit a logistic
logit_rsquared(math_model2)
```
The model did not improve very much. Any time you add predictors to a model,
the $R^{2}$ will increase, even if the variables are fairly meaningless, so it's
best to focus on including predictors that add meaningful explanatory power.
Now take a look at the $R^{2}$ for the absence variable.
```{r}
absence_model <- glm(ontime_grad ~ pct_days_absent_7, data = sea_data,
family = "binomial")
summary(absence_model)
logit_rsquared(absence_model)
```
Let's combine our two predictors and test their combined power.
```{r}
combined_model <- glm(ontime_grad ~ pct_days_absent_7 + scale_score_7_math,
data = sea_data, family = "binomial")
summary(combined_model)
logit_rsquared(combined_model)
```
Using this combined model, let's use the predict command to make our first
predictions.
```{r}
sea_data$grad_pred <- predict(combined_model, newdata = sea_data,
type = "response") # this tells R to give us a probability
```
This generates a new variable with the probability of ontime high school
graduation, according to the model. But if you look at the number of
observations with predictions, you'll see that it is smaller than the total
number of students. This is because R doesn't use observations that have
missing data for any of the variables in the model.
```{r}
table(is.na(sea_data$grad_pred))
```
Let's convert this probability to a 0/1 indicator for whether or not a student
is likely to graduate ontime. A good rule of thumb when starting out is to set
the probability cutoff at the mean of the outcome variable. In this example,
we can store this value as a variable:
```{r}
basic_thresh <- mean(sea_data$ontime_grad)
basic_thresh
```
If the predicted probability from the model is greater than this threshold,
we'll say the student is likely to graduate.
```{r}
sea_data$grad_indicator <- ifelse(sea_data$grad_pred > basic_thresh, 1, 0)
table(sea_data$grad_indicator, useNA = "always")
```
You can also plot the relationship between the probability and the outcome.
Ideally, you should see the proportion of graduates steadily increase for each
percentile of the probabilities. What does this relationship tell you?
```{r}
sea_data <- sea_data %>%
mutate(grad_pred_cut = ntile(grad_pred, n = 100)) %>%
group_by(grad_pred_cut) %>% # perform the operation for each value
mutate(grad_pred_cut_grad = mean(ontime_grad, na.rm=TRUE)) # add a new variable
plot(sea_data$grad_pred_cut, sea_data$grad_pred_cut_grad)
```
Let's evaluate the accuracy of the model by comparing the predictions to the
actual graduation outcomes for the students for whom we have predictions. This
type of crosstab is called a "confusion matrix." The observations in the upper
left corner, where the indicator and the actual outcome are both 0, are true
negatives. The observations in the lower right corner, where the indicator and
the outcome are both 1, are true positives. The upper right corner contains
false positives, and the lower left corner contains false negatives. Overall, if
you add up the cell percentages for true positives and true negatives, the model
got 56.1 percent of the predictions right.
```{r}
prop.table(
table(
grad = sea_data$ontime_grad, pred = sea_data$grad_indicator)) %>% # shorthand way to round
round(3)
```
However, almost all of the wrong predictions are false negatives--these are
students who have been flagged as dropout risks even though they
did graduate ontime. If you want your indicator system to have fewer false
negatives, you can lower the probability cutoff. The lower cutoff below has a
smaller share of false negatives and a larger share of false positives, with a
somewhat lower share of correct predictions.
```{r}
new_thresh <- basic_thresh - 0.05
prop.table(table(Observed = sea_data$ontime_grad,
Predicted = sea_data$grad_pred > new_thresh)) %>%
round(3)
```
Note that this table only includes the complete cases. To look at missing values
as well:
```{r}
prop.table(table(Observed = sea_data$ontime_grad,
Predicted = sea_data$grad_pred > new_thresh,
useNA="always")) %>% round(3)
```
## Missing Data
Another key business rule is how we will handle students with missing data. A
predictive analytics system is more useful if it makes an actionable prediction
for every student. It is good to check, if it is available, the graduation rates
for students with missing data:
```{r}
table(Grad = sea_data$ontime_grad,
miss_math = is.na(sea_data$scale_score_7_math))
table(Grad = sea_data$ontime_grad,
miss_abs = is.na(sea_data$pct_days_absent_7))
```
There are a number of options at this point. One is to run a model with fewer
variables for only those students, and then use that model to fill in the
missing indicators.
```{r}
absence_model <- glm(ontime_grad ~ pct_days_absent_7,
data = sea_data[is.na(sea_data$scale_score_7_math),],
family = "binomial")
```
```{r}
sea_data$grad_pred_2 <- predict(absence_model, newdata = sea_data,
type = "response")
summary(absence_model)
```
```{r}
table(sea_data$grad_indicator, useNA="always")
sea_data$grad_indicator[is.na(sea_data$grad_pred) &
sea_data$grad_pred_2 < new_thresh] <- 0
sea_data$grad_indicator[is.na(sea_data$grad_pred) &
sea_data$grad_pred_2 >= new_thresh] <- 1
table(sea_data$grad_indicator, useNA="always")
```
We now have predictions for all but a very small share of students, and those
students are split between graduates and non-graduates. We have to apply a rule
or a model to make predictions for them--we can't use information from the
future, except to develop the prediction system. We'll arbitrarily decide to
flag them as potential non-graduates, since students with lots of missing data
might merit some extra attention.
```{r}
table(sea_data$grad_indicator, sea_data$ontime_grad, useNA = "always")
sea_data$grad_indicator[is.na(sea_data$grad_indicator)] <- 0
```
## Evaluate Fit
Now we have a complete set of predictions from our simple models. How well does
the prediction system work? Can we do better?
```{r}
table(Observed = sea_data$ontime_grad, Predicted = sea_data$grad_indicator) %>%
prop.table %>% round(4)
```
A confusion matrix is one way to evaluate the success of a model and evaluate
tradeoffs as you are developing prediction systems, but there are others. We
will cover these more in the workshop, but in cases where we have an uneven
proportion of cases in each class (e.g. we have many more graduates than non-graduates),
it can be helpful to look at a metric like the AUC, which stands for "area under the
curve." You'll learn more about ways to evaluate a prediction system, including
the AUC metric, during Day 1 of the workshop, but here's a sneak peek.
First, use the `roc` command to plot the true positive rate (sensitivity in
the graph) against the false positive rate (1-specificity in the graph).
```{r calculateROC}
roc(sea_data$ontime_grad, sea_data$grad_indicator) %>% plot
```
You can also calculate the ROC for the continuous predicted probability, to
help you determine the threshold:
```{r calculateROC2}
roc(sea_data$ontime_grad, sea_data$grad_pred) %>% plot
```
You can also calculate the numeric summaries instead of just the graphs. To
do this let's use the `caret` package:
```{r}
# We must wrap these each in calls to factor because of how this function expects
# the data to be formatted
caret::confusionMatrix(data = factor(sea_data$grad_indicator),
reference = factor(sea_data$ontime_grad), positive = "1")
```
A couple of last thoughts and notes. First, note that so far we haven't done any
out-of-sample testing. We know from the pre-reading that we should never trust
our model fit measures on data the model was fit to -- statistical models are
overly confident. To combat this, you should subdivide your dataset. There are
many strategies you can choose from depending on how much data you have and the
nature of your problem - for the EWS case, we can use the first two cohorts to
build our models and the latter two cohorts to evaluate that fit.
Here is some code you can use to do that:
```{r}
# In R we can define an index of rows so we do not have to copy our data
train_idx <- row.names(sea_data[sea_data$year %in% c(2003, 2004, 2005),])
test_idx <- !row.names(sea_data) %in% train_idx
fit_model <- glm(ontime_grad ~ scale_score_7_math + frpl_7,
data = sea_data[train_idx, ], family = "binomial")
sea_data$grad_pred_3 <- predict(fit_model, newdata = sea_data, type = "response")
summary(sea_data$grad_pred_3)
# Check the test index only
summary(sea_data$grad_pred_3[test_idx])
# calculate matrix of the test_index
table(Observed = sea_data$ontime_grad[test_idx],
Predicted = sea_data$grad_pred_3[test_idx] > new_thresh) %>%
prop.table() %>% round(4)
```
Second, should we use subgroup membership variables (such as demographics or
school of enrollment) to make predictions, if they improve the accuracy of
predictions? This is more a policy question than a technical question, and you
should consider it when you are developing your models. You'll also want to
check to see how accurate your model is for different subgroups.
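One rough way to do that check is sketched below, looping over a subgroup
variable and tabulating predictions against outcomes within each group (the
choice of `race_ethnicity` and of `grad_indicator` as the prediction is just an
example).
```{r subgroup_accuracy_sketch, eval=FALSE}
# Confusion matrix and overall agreement rate by subgroup - a rough diagnostic
for(grp in unique(sea_data$race_ethnicity)){
  in_grp <- sea_data$race_ethnicity == grp
  print(grp)
  print(round(prop.table(
    table(Observed = sea_data$ontime_grad[in_grp],
          Predicted = sea_data$grad_indicator[in_grp])), 3))
  print(paste("Accuracy:",
              round(mean(sea_data$ontime_grad[in_grp] ==
                           sea_data$grad_indicator[in_grp], na.rm = TRUE), 3)))
}
```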
## Bonus
The above code will be more than enough to get you started. But, if you want
to reach further into predictive analytics, the next section provides some bonus
syntax and advice for building more complex statistical models.
Logistic regression is where you should start. It is fast to compute, easier to
interpret, and usually does a great job. However, there are a number of alternative
algorithms available to you and R provides a common interface to them through the
`caret` package. The `train` function (as in, training the models) is the
workhorse of the `caret` package. It has an extensive set of user-controlled
options.
When switching away from logistic regression, it can be advantageous to transform
predictors to be centered at 0 with a standard deviation of 1. This helps
put binary and continuous indicators on a more similar scale and helps avoid
problems associated with rounding, large numbers, and optimization algorithms
used to evaluate model fit. Here is an example of how you can do this in R:
```{r}
library(caret) # machine learning workhorse in R
sea_data <- as.data.frame(sea_data)
# To use caret we have to manually expand our categorical variables. caret
# can use formulas like lm() or glm() but becomes very slow and unreliable
# for some model/algorithm methods when doing so. Better to do it ourselves.
# Expand categorical variables
# First declare any variable we want to be treated as a category as a factor
sea_data$frpl_7 <- factor(sea_data$frpl_7)
sea_data$male <- factor(sea_data$male)
# Use the dummyVars function to build an expansion function to expand our data
expand_factors <- caret::dummyVars(~ race_ethnicity + frpl_7 + male,
data = sea_data)
# Create a matrix of indicator variables by using the predict method for the
# expand_factors object we just created. There are other ways you can do this, but
# the advantage is we can use the `expand_factors` object on future/different
# datasets to preserve factor levels that may not be present in those (like our
# test set perhaps)
fact_vars <- predict(expand_factors, sea_data)
# Get the column names of our categorical dummy variables
categ_vars <- colnames(fact_vars)
# Combine the new dummy variables with our original dataset
sea_data <- cbind(sea_data, fact_vars)
# Drop the matrix of dummy variables
rm(fact_vars)
# Define the numeric/continuous predictors we want to use
continuous_x_vars <- c('scale_score_7_math', 'scale_score_7_read',
'pct_days_absent_7', 'sch_g7_frpl_per')
# caret does not gracefully handle missing data, so we have to do some additional
# work when we define our training/test data split. You can choose additional
# ways to address missing data (imputation, substituting mean values, etc.) -
# here we opt for the simple but aggressive strategy of excluding any row
# with missing values for any predictor from being in the training data
train_idx <- row.names( # get the row.names to define our index
na.omit(sea_data[sea_data$year %in% c(2003, 2004, 2005),
# reduce the data to rows of specific years and not missing data
c(continuous_x_vars, categ_vars)])
# only select the predictors we are interested in
)
test_idx <- row.names( # same approach as the training index, for the later years
  na.omit(sea_data[sea_data$year > 2005,
                   c(continuous_x_vars, categ_vars)])
)
# All of our algorithms will run much faster and more smoothly if we center
# our continuous measures at 0 with a standard deviation of 1. We have
# skipped the important step of identifying and removing outliers, which we
# should make sure to do, but is left up to you!
pre_proc <- caret::preProcess(sea_data[train_idx, continuous_x_vars],
method = c("scale", "center"))
# We now have a pre-processing object which will scale and center our variables
# for us. It will ignore any variables that are not defined within it, so we
# can pass all of our continuous and dummy variables to it to produce our
# final data frame of predictors.
preds <- predict(pre_proc,
sea_data[train_idx, # keep only the training rows
c(continuous_x_vars, categ_vars)]
) # keep only the columns of dummy and continuous variables
```
Once we have defined our predictor variables, we need to tell `train` how we want
to test our models. Most of the algorithms offered through `caret` have "tuning
parameters", user-controlled values, that are not estimated from the data. Our
goal is to experiment with these values and find the values that fit the data
the best. To do this, we must tell `train` which values to try, and how to
evaluate their performance.
Luckily, `train()` has a number of sensible defaults that largely automate this
process for us. For the purpose of this exercise, a good set of defaults is to
use the `prSummary()` model evaluation function (which reports the area under
the precision-recall curve along with precision and recall) and to use
cross-validation.
```{r}
# Take advantage of all the processing power on your machine
library(doFuture)
plan(multiprocess(workers = 8)) # define the number of cpus to use
registerDoFuture() # register them with R
# For binary classification, caret works best when the outcome is coded as a
# factor with character labels rather than 0/1. In this case, we
# recode 0/1 to be nongrad, grad.
yvar <- sea_data[train_idx, "ontime_grad"] # save only training observations
yvar <- ifelse(yvar == 1, "grad", "nongrad")
yvar <- factor(yvar)
# On a standard desktop/laptop it can be necessary to decrease the sample size
# to train in a reasonable amount of time. For the prototype and getting feedback
# it's a good idea to stick with a reasonable sample size of under 30,000 rows.
# Let's do that here:
train_idx_small <- sample(1:nrow(preds), 2e4)
# Caret has a number of complex options, you can read about under ?trainControl
# Here we set some sensible defaults
example_control <- trainControl(
method = "cv", # we cross-validate our model to avoid overfitting
classProbs = TRUE, # we want to be able to predict probabilities, not just binary outcomes
returnData = TRUE, # we want to store the model data to allow for postestimation
summaryFunction = prSummary, # we want to use the prSummary for better two-class accuracy measures
trim = TRUE, # we want to reduce the size of the final model object to save RAM
savePredictions = "final", # we want to store the predictions from the final model
returnResamp = "final", # we want to store the resampling code to directly compare other methods
allowParallel = TRUE # we want to use all the processors on our computer if possible
)
# This will take quite some time:
set.seed(2532) # set seed so models are comparable
rpart_model <- train(
y = yvar[train_idx_small], # specify the outcome, here subset
# to the smaller number of observations
x = preds[train_idx_small,], # specify the matrix of predictors, again subset
method = "rpart", # choose the algorithm - here a regression tree
tuneLength = 24, # choose how many values of tuning parameters to test
trControl = example_control, # set up the conditions for the test (defined above)
metric = "AUC" # select the metric to choose the best model based on
)
set.seed(2532)
# Repeat above but just change the `method` argument to nb for naiveBayes
nb_model <- train(y = yvar[train_idx_small],
x = preds[train_idx_small,],
method = "nb", tuneLength = 24,
trControl = example_control,
metric = "AUC")
# Construct a list of model performance comparing the two models directly
resamps <- resamples(list(RPART = rpart_model,
NB = nb_model))
# Summarize it
summary(resamps)
# plot it
# see ?resamples for more options
dotplot(resamps, metric = "AUC")
```
## Test Your Results
Now we need to evaluate our performance on our hold-out set of data.
```{r}
# We use the pre-processing we defined on the training data, but this time
# we create test_data with these values:
test_data <- predict(pre_proc,
sea_data[test_idx, # specify the rows in the test set
c(continuous_x_vars, categ_vars)]) # keep the same vars
# Create a vector of outcomes for the test set
test_y <- sea_data[test_idx, "ontime_grad"]
test_y <- ifelse(test_y == 1, "grad", "nongrad")
test_y <- factor(test_y)
confusionMatrix(reference = test_y, data = predict(rpart_model, test_data))
# Note that making predictions from the nb classifier can take a long time
# consider alternative models or making fewer predictions if this is a bottleneck
confusionMatrix(reference = test_y, data = predict(nb_model, test_data))
```
We can also make ROC plots of our test-set, out of sample, prediction accuracy.
```{r}
# ROC plots
yhat <- predict(rpart_model, test_data, type = "prob")
yhat <- yhat$grad
roc(response = test_y, predictor = yhat) %>% plot
# NB ROC plot
# Note that making predictions from the nb classifier can take a long time
# consider alternative models or making fewer predictions if this is a bottleneck
yhat <- predict(nb_model, test_data, type = "prob")
yhat <- yhat$grad
roc(response = test_y, predictor = yhat) %>% plot
```
## References and More
The `caret` package has extensive documentation which you can use to find
new models, learn about algorithms, and explore many other important aspects
of building predictive analytics. You can learn more here:
https://topepo.github.io/caret/