Skip to content

Instantly share code, notes, and snippets.

@inkhorn
Created September 13, 2013 00:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save inkhorn/6545575 to your computer and use it in GitHub Desktop.
Save inkhorn/6545575 to your computer and use it in GitHub Desktop.
E-bike Survey Analysis
library(rpart)
library(plyr)
library(rpart.plot)
# Load the survey responses. stringsAsFactors = TRUE is spelled out because the
# recoding further down calls levels() on these columns; since R 4.0.0
# read.csv() no longer converts strings to factors by default, in which case
# levels() would return NULL and the mapvalues() calls would fail.
ebike <- read.csv("E-Bike_Survey_Responses.csv", stringsAsFactors = TRUE)
# This next part is strictly to change any blank responses into NAs.
# Only the cell values in columns 2 to 10 are replaced; the "" factor level
# itself is still listed in levels() afterwards.
ebike[, 2:10][ebike[, 2:10] == ""] <- NA
# Recode the demographic answers with plyr::mapvalues(). For Sex, Health and
# Edu the levels at the excluded index positions are kept as they are, and every
# other level is merged into a single "Other" level. The blank ("") level
# becomes "Other" for Sex and NA for every other variable.
# NOTE(review): the index positions passed to `-c(...)` and the `rep()` counts
# depend on the level order of this particular CSV -- confirm against
# levels() if the data file changes.
ebike$Sex <- mapvalues(
  ebike[["Sex"]],
  from = c("", levels(ebike[["Sex"]])[-c(1, 2, 6)]),
  to = rep("Other", 11)
)
ebike$Health <- mapvalues(
  ebike[["How.would.you.describe.your.level.of.physical.health."]],
  from = c(
    "",
    levels(
      ebike[["How.would.you.describe.your.level.of.physical.health."]]
    )[-c(1, 4, 5, 6, 12, 13)]
  ),
  to = c(NA, rep("Other", 7))
)
ebike$Edu <- mapvalues(
  ebike[[5]],
  from = c("", levels(ebike[[5]])[-c(1, 4, 8, 14, 23)]),
  to = c(NA, rep("Other", 20))
)
# Income (column 6) and Age (column 2) keep all their answers; only the blank
# level is turned into NA.
ebike$Income <- mapvalues(ebike[[6]], from = "", to = NA)
ebike$Age <- mapvalues(ebike[[2]], from = "", to = NA)
# Column 11 holds varied answers, so each response is classified by the first
# keyword it contains, in this priority order:
#   "bicycle" > "e-bike" > "car" > "transit" > anything else ("Other").
# The labels are written from lowest to highest priority so that a
# higher-priority keyword overwrites a lower one. Matching uses grepl()
# defaults, so it is case-sensitive and matches substrings.
ebike$transport <- local({
  answer <- ebike[[11]]
  label <- rep("Other", length(answer))
  label[grepl("transit", answer)] <- "Transit"
  label[grepl("car", answer)] <- "Car"
  label[grepl("e-bike", answer)] <- "E-bike"
  label[grepl("bicycle", answer)] <- "Bicycle"
  factor(label)
})
# Fit two separate partition trees for "is this respondent an E-bike user?":
#   b -- split on Sex, Health and Age (the demographic/physical dimension)
#   c -- split on Edu and Income (the socio-economic dimension)
# All five predictors can go into one model, but then only some of them are
# picked for the splits; keeping the two groups apart describes E-bike users
# on each dimension separately.
# NOTE(review): the response is the logical expression transport == "E-bike",
# not a factor -- check which `method` rpart() picks by default if a
# classification tree is what is intended.
b <- rpart(
  formula = transport == "E-bike" ~ Sex + Health + Age,
  data = ebike
)
c <- rpart(
  formula = transport == "E-bike" ~ Edu + Income,
  data = ebike
)
# Draw both partition trees. `faclen` is set generously (10 and 20) so the
# factor level labels are shown in their entirety instead of being
# abbreviated; `varlen = 0` is passed for the variable names.
rpart.plot(
  b,
  type = 1,
  extra = 1,
  varlen = 0,
  faclen = 10
)
rpart.plot(
  c,
  type = 1,
  extra = 1,
  varlen = 0,
  faclen = 20
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment