awcull/analysis

## analysis
library(ggplot2)
library(scales)
library(reshape)
library(ggmap)
setwd("C:\\code\\Divvy_Stations_Trips_2013")

## Read data
data <- read.csv("Divvy_Trips_Clean_2013.csv", stringsAsFactors=F)
data.stations <- read.csv("Divvy_Stations_2013.csv")

## Easy stuff
ggplot(na.omit(data), aes(gender)) + geom_bar() + labs(title="Gender") # gender diff
ggsave(file="gender.png")
data$age <- as.numeric(format(Sys.Date(), "%Y")) - data$birthyear
ggplot(data, aes(age)) + geom_bar() + scale_x_continuous(breaks=seq(0,120,10)) + labs(title="Age Distribution") # age histogram
ggsave(file="age.png")

# Convert dates
data$starttime <- as.POSIXct(data$starttime)
data$stoptime <- as.POSIXct(data$stoptime)

## Take look at pickup and drop off times
# Need to times, due to how this works we just put all the times as one day
pickup.time <- as.POSIXct(format(data[,2], "%H:%M:%S"), "%H:%M:%S", tz="GMT")
dropoff.time <- as.POSIXct(format(data[,3], "%H:%M:%S"), "%H:%M:%S", tz="GMT")

pickup <- data.frame(time=pickup.time)
pickup$type <- "pickup"
dropoff <- data.frame(time=dropoff.time)
dropoff$type <- "dropoff"

time.data <- rbind(pickup,dropoff)
# pick up drop off times
ggplot(time.data, aes(time, fill=type)) + geom_density(alpha=0.4) +
  scale_x_datetime(breaks=date_breaks("2 hour"), labels=date_format("%H:%M")) +
  theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + labs(title="Pick Up and Drop Off")
ggsave(file="Pickup vs Dropoff.png")

## Look at days of the week
day.pickup <- weekdays(data$starttime)
day.dropoff <- weekdays(data$stoptime)
day.pickup.df <- data.frame(time=pickup.time, day=day.pickup)
day.dropoff.df <- data.frame(time=dropoff.time, day=day.dropoff)

# Daily pick up times
ggplot(day.pickup.df, aes(time, fill=day)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
  theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + scale_fill_brewer(palette="Set1") + labs(title="Daily Pick Up Times")
ggsave(file="daily pickup.png")
# Daily drop off times
ggplot(day.dropoff.df, aes(time, fill=day)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
  theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + scale_fill_brewer(palette="Set1") + labs(title="Daily Drop off Times")
ggsave(file="daily dropoff.png")

## Look at weekend/days
weekday <- c("Monday","Tuesday","Wednesday","Thursday","Friday")
weekend <- c("Saturday", "Sunday")
wk.day.end <- rep("", nrow(data))
wk.day.end[day.pickup %in% weekday] <- "Weekday"
wk.day.end[!(day.pickup %in% weekday)] <- "Weekend"
wk.day.end.df <- data.frame(time=pickup.time, type=wk.day.end)

# Weekend vs weekday pick up times
ggplot(wk.day.end.df, aes(time, fill=type)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
  theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + labs(title="Weekday vs Weekend Pickup Times")
ggsave(file="weekday vs weekend.png")

## Look at what times certain age ranges are using
age.brackets <- function(x) {
  siz <- length(x)
  tmp <- rep("", siz)
  tmp[which(x >= 0 & x < 13)] <- "Under 13"
  tmp[which(x >= 13 & x < 20)] <- "Teenager"
  tmp[which(x >= 20 & x <30)] <- "Twenties"
  tmp[which(x >= 30 & x < 40)] <- "Thirties"
  tmp[which(x >= 40 & x < 50)] <- "Forties"
  tmp[which(x >= 50 & x < 60)] <- "Fifties"
  tmp[which(x >= 60 & x < 70)] <- "Sixties"
  tmp[which(x >= 70 & x < 80)] <- "Seventies"
  return(tmp)
}
time.age <- data.frame(time=pickup.time, age=age.brackets(data$age))
time.age <- time.age[time.age$age != "",]
# Pick up times by age
ggplot(time.age, aes(time, fill=age)) + geom_bar() + scale_fill_brewer(palette="Set1") + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
  theme(axis.text.x=element_text(angle=90)) + labs(title="Age Pickup Times")
ggsave(file="agePickup.png")

## Stations
stat.table <- table(data$from_station_name, data$to_station_name) # Create table
stat.table.df <- as.data.frame(stat.table) # Coerce to DF
stat.table.df <- stat.table.df[order(stat.table.df$Freq, decreasing=T),] # Order by most frequent from -> to station
# Pull out used and unused
stat.used <- stat.table.df[stat.table.df$Freq > 0,]
stat.used$scale <- stat.used$Freq / sum(stat.used$Freq)
stat.unused <- stat.table.df[stat.table.df$Freq == 0,]

# Plot up top 15 stations as a heatmap.  Cant plot up full stations because of large number in a sparse matrix
ggplot(stat.used[1:15,], aes(Var1, Var2)) + geom_tile(aes(fill=Freq)) +
  scale_fill_gradient(low="white", high="steelblue") + labs(x="",y="") +
  theme(axis.text.x=element_text(angle=90), axis.ticks=element_blank(), legend.position="none") + labs(title="Usage of top 15 Stations")
ggsave(file="heatmap.png")

# Try and make map of routes
# merge locations with coordinates
data.tmp <- ddply(data.frame(stations=c(data$from_station_name, data$to_station_name)), .(stations), summarize, log.freq=log(length(stations)))
data.merged <- merge(data.tmp, data.stations, by.x="stations", by.y="name")

# Get map
map <- get_map(location=c(lon=mean(data.merged$longitude), lat=mean(data.merged$latitude)), zoom=12, color="bw")

# Quick look at where the uses are
ggmap(map) + geom_point(aes(x=longitude, y=latitude, colour=log.freq), data=data.merged) + scale_colour_gradient(low="white", high="black") +
  theme(axis.title.x=element_blank(), axis.title.y=element_blank()) + labs(title="Station Use")
ggsave(file="mapColorFreq.png")

# Overlay a heatmap onto it, using different colours
ggmap(map) + stat_density2d(mapping=aes(x=longitude, y=latitude, fill=log.freq, alpha=..level..), geom="polygon",data=data.merged) +
  geom_point(aes(x=longitude, y=latitude, colour=log.freq), data=data.merged) + scale_colour_gradient(low="black", high="steelblue") +
  theme(legend.position="none", axis.title.x=element_blank(), axis.title.y=element_blank()) + labs(title="Station Use HeatMap")
ggsave(file="mapHeatFreq.png")

## cleanData
setwd("C:\\code\\Divvy_Stations_Trips_2013")

## Read data
data <- read.csv("Divvy_Trips_2013.csv", stringsAsFactors=F)

## Look at what kind of data we have first
summary(data)

## starttime and stoptime should be a POSIXct class, also want to change it to ISO style of %Y-%m-%d %H:%M:%S, makes stuff later easier
# Take a look at values to know format
head(data$starttime)
head(data$stoptime) # check to make sure data is same format between the two
# Format then is %m/%d/%Y %HH:%MM
time.format <- "%m/%d/%Y %H:%M"
data$starttime <- as.POSIXct(data$starttime, time.format, tz="America/Chicago")
data$stoptime <- as.POSIXct(data$stoptime, time.format, tz="America/Chicago")

## From this tripduration/from_station_id/to_station_id are reported as characters but really should be integers
# Find out what values we are dealing with
unique(data$tripduration)
sort(unique(data$to_station_id)))
sort(unique(data$from_station_id))

# From this the easiest fix is tripduration.  Values have a , in them!
data$tripduration <- as.numeric(gsub(",", "", data$tripduration))
# Next up lets find out what is causing the #N/A in to_station_id and from_station_id
tmp_1 <- which(data$to_station_id == "#N/A")
head(data[tmp_1,])
# Looks like its Congress Pkwy & Ogden Ave that may be causing the issue, but lets double check this
unique(data[tmp_1,]$to_station_name)
# only get the one, now lets check again for from_station_name. Most likely will have the same issue
tmp_2 <- which(data$from_station_id == "#N/A")
head(data[tmp_2,])
unique(data[tmp_2,]$from_station_name)
# Again same issue, so lets assign our own station id
known_station_ids <- sort(unique(as.numeric(c(data$from_station_id, data$to_station_id))))
# Check to see if value is there, if not the station becomes that id
tmp_id <- 9999
is.done <- F
while (!(is.done)) {
  if (!(tmp_id %in% known_station_ids)) {
    data$to_station_id[tmp_1] <- tmp_id
    data$from_station_id[tmp_2] <- tmp_id
    is.done <- T
  }
  tmp_id <- tmp_id + 1
}
# Convert over to numeric
data$to_station_id <- as.numeric(data$to_station_id)
data$from_station_id <- as.numeric(data$from_station_id)

## Save data
write.csv(data, "Divvy_Trips_Clean_2013.csv", row.names=F)
	library(ggplot2)
	library(scales)
	library(reshape)
	library(ggmap)
	setwd("C:\\code\\Divvy_Stations_Trips_2013")

	## Read data
	data <- read.csv("Divvy_Trips_Clean_2013.csv", stringsAsFactors=F)
	data.stations <- read.csv("Divvy_Stations_2013.csv")

	## Easy stuff
	ggplot(na.omit(data), aes(gender)) + geom_bar() + labs(title="Gender") # gender diff
	ggsave(file="gender.png")
	data$age <- as.numeric(format(Sys.Date(), "%Y")) - data$birthyear
	ggplot(data, aes(age)) + geom_bar() + scale_x_continuous(breaks=seq(0,120,10)) + labs(title="Age Distribution") # age histogram
	ggsave(file="age.png")

	# Convert dates
	data$starttime <- as.POSIXct(data$starttime)
	data$stoptime <- as.POSIXct(data$stoptime)

	## Take look at pickup and drop off times
	# Need to times, due to how this works we just put all the times as one day
	pickup.time <- as.POSIXct(format(data[,2], "%H:%M:%S"), "%H:%M:%S", tz="GMT")
	dropoff.time <- as.POSIXct(format(data[,3], "%H:%M:%S"), "%H:%M:%S", tz="GMT")

	pickup <- data.frame(time=pickup.time)
	pickup$type <- "pickup"
	dropoff <- data.frame(time=dropoff.time)
	dropoff$type <- "dropoff"

	time.data <- rbind(pickup,dropoff)
	# pick up drop off times
	ggplot(time.data, aes(time, fill=type)) + geom_density(alpha=0.4) +
	scale_x_datetime(breaks=date_breaks("2 hour"), labels=date_format("%H:%M")) +
	theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + labs(title="Pick Up and Drop Off")
	ggsave(file="Pickup vs Dropoff.png")

	## Look at days of the week
	day.pickup <- weekdays(data$starttime)
	day.dropoff <- weekdays(data$stoptime)
	day.pickup.df <- data.frame(time=pickup.time, day=day.pickup)
	day.dropoff.df <- data.frame(time=dropoff.time, day=day.dropoff)

	# Daily pick up times
	ggplot(day.pickup.df, aes(time, fill=day)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
	theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + scale_fill_brewer(palette="Set1") + labs(title="Daily Pick Up Times")
	ggsave(file="daily pickup.png")
	# Daily drop off times
	ggplot(day.dropoff.df, aes(time, fill=day)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
	theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + scale_fill_brewer(palette="Set1") + labs(title="Daily Drop off Times")
	ggsave(file="daily dropoff.png")

	## Look at weekend/days
	weekday <- c("Monday","Tuesday","Wednesday","Thursday","Friday")
	weekend <- c("Saturday", "Sunday")
	wk.day.end <- rep("", nrow(data))
	wk.day.end[day.pickup %in% weekday] <- "Weekday"
	wk.day.end[!(day.pickup %in% weekday)] <- "Weekend"
	wk.day.end.df <- data.frame(time=pickup.time, type=wk.day.end)

	# Weekend vs weekday pick up times
	ggplot(wk.day.end.df, aes(time, fill=type)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
	theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + labs(title="Weekday vs Weekend Pickup Times")
	ggsave(file="weekday vs weekend.png")

	## Look at what times certain age ranges are using
	age.brackets <- function(x) {
	siz <- length(x)
	tmp <- rep("", siz)
	tmp[which(x >= 0 & x < 13)] <- "Under 13"
	tmp[which(x >= 13 & x < 20)] <- "Teenager"
	tmp[which(x >= 20 & x <30)] <- "Twenties"
	tmp[which(x >= 30 & x < 40)] <- "Thirties"
	tmp[which(x >= 40 & x < 50)] <- "Forties"
	tmp[which(x >= 50 & x < 60)] <- "Fifties"
	tmp[which(x >= 60 & x < 70)] <- "Sixties"
	tmp[which(x >= 70 & x < 80)] <- "Seventies"
	return(tmp)
	}
	time.age <- data.frame(time=pickup.time, age=age.brackets(data$age))
	time.age <- time.age[time.age$age != "",]
	# Pick up times by age
	ggplot(time.age, aes(time, fill=age)) + geom_bar() + scale_fill_brewer(palette="Set1") + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
	theme(axis.text.x=element_text(angle=90)) + labs(title="Age Pickup Times")
	ggsave(file="agePickup.png")

	## Stations
	stat.table <- table(data$from_station_name, data$to_station_name) # Create table
	stat.table.df <- as.data.frame(stat.table) # Coerce to DF
	stat.table.df <- stat.table.df[order(stat.table.df$Freq, decreasing=T),] # Order by most frequent from -> to station
	# Pull out used and unused
	stat.used <- stat.table.df[stat.table.df$Freq > 0,]
	stat.used$scale <- stat.used$Freq / sum(stat.used$Freq)
	stat.unused <- stat.table.df[stat.table.df$Freq == 0,]

	# Plot up top 15 stations as a heatmap. Cant plot up full stations because of large number in a sparse matrix
	ggplot(stat.used[1:15,], aes(Var1, Var2)) + geom_tile(aes(fill=Freq)) +
	scale_fill_gradient(low="white", high="steelblue") + labs(x="",y="") +
	theme(axis.text.x=element_text(angle=90), axis.ticks=element_blank(), legend.position="none") + labs(title="Usage of top 15 Stations")
	ggsave(file="heatmap.png")

	# Try and make map of routes
	# merge locations with coordinates
	data.tmp <- ddply(data.frame(stations=c(data$from_station_name, data$to_station_name)), .(stations), summarize, log.freq=log(length(stations)))
	data.merged <- merge(data.tmp, data.stations, by.x="stations", by.y="name")

	# Get map
	map <- get_map(location=c(lon=mean(data.merged$longitude), lat=mean(data.merged$latitude)), zoom=12, color="bw")

	# Quick look at where the uses are
	ggmap(map) + geom_point(aes(x=longitude, y=latitude, colour=log.freq), data=data.merged) + scale_colour_gradient(low="white", high="black") +
	theme(axis.title.x=element_blank(), axis.title.y=element_blank()) + labs(title="Station Use")
	ggsave(file="mapColorFreq.png")

	# Overlay a heatmap onto it, using different colours
	ggmap(map) + stat_density2d(mapping=aes(x=longitude, y=latitude, fill=log.freq, alpha=..level..), geom="polygon",data=data.merged) +
	geom_point(aes(x=longitude, y=latitude, colour=log.freq), data=data.merged) + scale_colour_gradient(low="black", high="steelblue") +
	theme(legend.position="none", axis.title.x=element_blank(), axis.title.y=element_blank()) + labs(title="Station Use HeatMap")
	ggsave(file="mapHeatFreq.png")
	setwd("C:\\code\\Divvy_Stations_Trips_2013")

	## Read data
	data <- read.csv("Divvy_Trips_2013.csv", stringsAsFactors=F)

	## Look at what kind of data we have first
	summary(data)

	## starttime and stoptime should be a POSIXct class, also want to change it to ISO style of %Y-%m-%d %H:%M:%S, makes stuff later easier
	# Take a look at values to know format
	head(data$starttime)
	head(data$stoptime) # check to make sure data is same format between the two
	# Format then is %m/%d/%Y %HH:%MM
	time.format <- "%m/%d/%Y %H:%M"
	data$starttime <- as.POSIXct(data$starttime, time.format, tz="America/Chicago")
	data$stoptime <- as.POSIXct(data$stoptime, time.format, tz="America/Chicago")

	## From this tripduration/from_station_id/to_station_id are reported as characters but really should be integers
	# Find out what values we are dealing with
	unique(data$tripduration)
	sort(unique(data$to_station_id)))
	sort(unique(data$from_station_id))

	# From this the easiest fix is tripduration. Values have a , in them!
	data$tripduration <- as.numeric(gsub(",", "", data$tripduration))
	# Next up lets find out what is causing the #N/A in to_station_id and from_station_id
	tmp_1 <- which(data$to_station_id == "#N/A")
	head(data[tmp_1,])
	# Looks like its Congress Pkwy & Ogden Ave that may be causing the issue, but lets double check this
	unique(data[tmp_1,]$to_station_name)
	# only get the one, now lets check again for from_station_name. Most likely will have the same issue
	tmp_2 <- which(data$from_station_id == "#N/A")
	head(data[tmp_2,])
	unique(data[tmp_2,]$from_station_name)
	# Again same issue, so lets assign our own station id
	known_station_ids <- sort(unique(as.numeric(c(data$from_station_id, data$to_station_id))))
	# Check to see if value is there, if not the station becomes that id
	tmp_id <- 9999
	is.done <- F
	while (!(is.done)) {
	if (!(tmp_id %in% known_station_ids)) {
	data$to_station_id[tmp_1] <- tmp_id
	data$from_station_id[tmp_2] <- tmp_id
	is.done <- T
	}
	tmp_id <- tmp_id + 1
	}
	# Convert over to numeric
	data$to_station_id <- as.numeric(data$to_station_id)
	data$from_station_id <- as.numeric(data$from_station_id)

	## Save data
	write.csv(data, "Divvy_Trips_Clean_2013.csv", row.names=F)