# inkhorn/ebike.r

## ebike.r
library(rpart)
library(plyr)
library(rpart.plot)

# Load the survey responses. The recoding below builds its `from` vectors out
# of levels(), so text columns must arrive as factors. Since R 4.0.0 read.csv()
# reads strings as character by default; levels() would then return NULL and
# mapvalues() would stop because `from` and `to` differ in length. Requesting
# factors explicitly keeps the script working on both old and new R versions.
ebike <- read.csv("E-Bike_Survey_Responses.csv", stringsAsFactors = TRUE)

# Turn blank responses in columns 2 through 10 into NA. Only the values change:
# the empty-string level stays in each factor and is handled by the mapvalues()
# calls below.
ebike[, 2:10][ebike[, 2:10] == ""] <- NA

# Use mapvalues() from plyr to remap the leftover blank level and to collapse
# the less common answers of each factor into a single "Other" level.
# The negative indices exclude the levels that are left untouched (index 1 is
# presumably the blank level, which is listed explicitly instead). Both the
# indices and the rep() counts are tied to the level order of this particular
# survey export -- TODO confirm whenever the CSV changes.

# Sex: the blank level and the ten excluded-from-keep levels all become "Other".
ebike$Sex <- mapvalues(
  ebike$Sex,
  from = c("", levels(ebike$Sex)[-c(1, 2, 6)]),
  to = c("Other", rep("Other", 10))
)

# Health: the blank level becomes NA, seven other levels become "Other".
ebike$Health <- mapvalues(
  ebike$How.would.you.describe.your.level.of.physical.health.,
  from = c(
    "",
    levels(ebike$How.would.you.describe.your.level.of.physical.health.)[-c(1, 4, 5, 6, 12, 13)]
  ),
  to = c(NA, rep("Other", 7))
)

# Edu (column 5): the blank level becomes NA, twenty other levels become "Other".
ebike$Edu <- mapvalues(
  ebike[, 5],
  from = c("", levels(ebike[, 5])[-c(1, 4, 8, 14, 23)]),
  to = c(NA, rep("Other", 20))
)

# Income (column 6) and Age (column 2): only the blank level is remapped to NA.
ebike$Income <- mapvalues(ebike[, 6], from = "", to = NA)
ebike$Age <- mapvalues(ebike[, 2], from = "", to = NA)

# Answers in column 11 vary a lot, but most of them contain one of the keywords
# below. Labels are written from lowest to highest priority so that a later
# match overwrites an earlier one: "bicycle" wins over "e-bike", which wins
# over "car", which wins over "transit"; everything else stays "Other".
ebike$transport <- local({
  answers <- ebike[, 11]
  category <- rep("Other", length(answers))
  category[grepl("transit", answers)] <- "Transit"
  category[grepl("car", answers)] <- "Car"
  category[grepl("e-bike", answers)] <- "E-bike"
  category[grepl("bicycle", answers)] <- "Bicycle"
  factor(category)
})

# Grow two separate trees for the response `transport == "E-bike"`: the first
# splits on Sex, Health and Age (they seem to belong together), the second on
# Edu and Income. All five predictors could go into one model, but then only
# some of them are chosen for the splits, so keeping the trees apart describes
# E-bike users along separate dimensions.
# Note: the variable `c` shares its name with base::c(); calls such as c(1, 2)
# still resolve to the function because R skips non-function objects when
# looking up a function name.
b <- rpart(
  formula = transport == "E-bike" ~ Sex + Health + Age,
  data = ebike
)
c <- rpart(
  formula = transport == "E-bike" ~ Edu + Income,
  data = ebike
)

# Draw both partition trees. `faclen` is chosen large, with the intent that
# factor level labels are shown in their entirety.
rpart.plot(b, type = 1, extra = 1, varlen = 0, faclen = 10)
rpart.plot(c, type = 1, extra = 1, varlen = 0, faclen = 20)
	library(rpart)
	library(plyr)
	library(rpart.plot)

	# NOTE(review): this section repeats the load-and-recode steps that appear
	# earlier in this file -- confirm whether the duplicate is intentional.

	# Load the survey responses. The recoding below builds its `from` vectors out
	# of levels(), so text columns must arrive as factors. Since R 4.0.0 read.csv()
	# reads strings as character by default; levels() would then return NULL and
	# mapvalues() would stop because `from` and `to` differ in length. Requesting
	# factors explicitly keeps the script working on both old and new R versions.
	ebike <- read.csv("E-Bike_Survey_Responses.csv", stringsAsFactors = TRUE)

	# Turn blank responses in columns 2 through 10 into NA. Only the values change:
	# the empty-string level stays in each factor and is handled by the mapvalues()
	# calls below.
	ebike[, 2:10][ebike[, 2:10] == ""] <- NA

	# Use mapvalues() from plyr to remap the leftover blank level and to collapse
	# the less common answers of each factor into a single "Other" level.
	# The negative indices exclude the levels that are left untouched (index 1 is
	# presumably the blank level, which is listed explicitly instead). Both the
	# indices and the rep() counts are tied to the level order of this particular
	# survey export -- TODO confirm whenever the CSV changes.

	# Sex: the blank level and ten further levels all become "Other".
	ebike$Sex <- mapvalues(
	  ebike$Sex,
	  from = c("", levels(ebike$Sex)[-c(1, 2, 6)]),
	  to = c("Other", rep("Other", 10))
	)

	# Health: the blank level becomes NA, seven other levels become "Other".
	ebike$Health <- mapvalues(
	  ebike$How.would.you.describe.your.level.of.physical.health.,
	  from = c(
	    "",
	    levels(ebike$How.would.you.describe.your.level.of.physical.health.)[-c(1, 4, 5, 6, 12, 13)]
	  ),
	  to = c(NA, rep("Other", 7))
	)

	# Edu (column 5): the blank level becomes NA, twenty other levels become "Other".
	ebike$Edu <- mapvalues(
	  ebike[, 5],
	  from = c("", levels(ebike[, 5])[-c(1, 4, 8, 14, 23)]),
	  to = c(NA, rep("Other", 20))
	)

	# Income (column 6) and Age (column 2): only the blank level is remapped to NA.
	ebike$Income <- mapvalues(ebike[, 6], from = "", to = NA)
	ebike$Age <- mapvalues(ebike[, 2], from = "", to = NA)

	# Answers in column 11 vary a lot, but most of them contain one of the keywords
	# below. Labels are written from lowest to highest priority so that a later
	# match overwrites an earlier one: "bicycle" wins over "e-bike", which wins
	# over "car", which wins over "transit"; everything else stays "Other".
	ebike$transport <- local({
	  answers <- ebike[, 11]
	  category <- rep("Other", length(answers))
	  category[grepl("transit", answers)] <- "Transit"
	  category[grepl("car", answers)] <- "Car"
	  category[grepl("e-bike", answers)] <- "E-bike"
	  category[grepl("bicycle", answers)] <- "Bicycle"
	  factor(category)
	})

	# Grow two separate trees for the response `transport == "E-bike"`: the first
	# splits on Sex, Health and Age (they seem to belong together), the second on
	# Edu and Income. All five predictors could go into one model, but then only
	# some of them are chosen for the splits, so keeping the trees apart describes
	# E-bike users along separate dimensions.
	# Note: the variable `c` shares its name with base::c(); calls such as c(1, 2)
	# still resolve to the function because R skips non-function objects when
	# looking up a function name.
	b <- rpart(
	  formula = transport == "E-bike" ~ Sex + Health + Age,
	  data = ebike
	)
	c <- rpart(
	  formula = transport == "E-bike" ~ Edu + Income,
	  data = ebike
	)

	# Draw both partition trees. `faclen` is chosen large, with the intent that
	# factor level labels are shown in their entirety.
	rpart.plot(b, type = 1, extra = 1, varlen = 0, faclen = 10)
	rpart.plot(c, type = 1, extra = 1, varlen = 0, faclen = 20)