Skip to content

Instantly share code, notes, and snippets.

@arcktip
Forked from bsmithgall/top_stations.R
Created December 30, 2013 05:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arcktip/8178217 to your computer and use it in GitHub Desktop.
Save arcktip/8178217 to your computer and use it in GitHub Desktop.
library('RCurl')
library('RJSONIO')
library('igraph')
stations.url <- getURL('http://citibikenyc.com/stations/json')
dist.url <- getURL('appservices.citibikenyc.com/data2/stations.php')
# prep stations for kmeans clustering
stations.prep <- function(url) {
stations.json2 <- fromJSON(url, method='C')
stations.list2 <- as.data.frame(matrix(unlist(
lapply(stations.json2$stationBeanList, function(x){
cbind(x$stationName, x$id, x$availableDocks, x$totalDocks, x$latitude, x$longitude)
})), nrow=length(stations.json2$stationBeanList), byrow=T)
)
# fix up some problems with factors
stations2 <- as.data.frame(cbind(
as.numeric(as.character(stations.list2$V2)),
as.numeric(as.character(stations.list2$V3)),
as.numeric(as.character(stations.list2$V4)),
as.numeric(as.character(stations.list2$V5)),
as.numeric(as.character(stations.list2$V6))
))
stations2$name<-as.character(stations.list2$V1)
names(stations2) <- c('id','available','total','lat','long', 'name')
stations.tocluster2 <- stations2[(
stations2$total-stations2$available)/stations2$total < .2,]
return(stations.tocluster2)
}
# cluster the stations
stations.kmeans <- function(stations.df) {
clusterables <- data.frame(stations.df$lat,stations.df$long)
row.names(clusterables) <- stations.df$id
clustered <- kmeans(clusterables, 4)
stations.cluster2 <- data.frame(stations.df, clustered$cluster)
return(stations.cluster2)
}
# prep functions for igraph analysis
degrees.to.radians<-function(degrees) {
return(degrees*pi/180)
}
haversine <- function(long1, lat1, long2, lat2) {
R <- 6371 # Earth mean radius [km]
delta.long <- (long2 - long1)
delta.lat <- (lat2 - lat1)
a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2
c <- 2 * asin(min(1,sqrt(a)))
d = R * c
return(d) # Distance in km
}
haversine.distance <- function(long1, lat1, long2, lat2) {
long1 <- degrees.to.radians(long1)
lat1 <- degrees.to.radians(lat1)
long2 <- degrees.to.radians(long2)
lat2 <- degrees.to.radians(lat2)
return(haversine(long1, lat1, long2, lat2))
}
# function to generate graph data, a df of columns node1, node2, distance
graph.explode <- function(clustered.df) {
y <- data.frame(
t(apply(
combn(paste(clustered.df$id, clustered.df$lat, clustered.df$lon, sep=","), 2), 2,
function(i){
i.names <- unlist(strsplit(as.character(i), split='\\,'))
latslongs <- c(as.numeric(i.names[2]),as.numeric(i.names[3]),
as.numeric(i.names[5]),as.numeric(i.names[6]))
return(c(i.names[1],i.names[4],
haversine.distance(latslongs[1],latslongs[2],
latslongs[3],latslongs[4])))
})))
# cast the weight to numeric
y[,3] <- as.numeric(as.character(y[,3]))
# to get a better graph, we are going to dump connections with longer
# distances than half a mile (~.8 km)
y <- y[y[,3] < .8,]
return(y)
}
# now that we are ready, dump everything into an igraph,
# extract the degree centrality, and return it out as a data.frame
get.graph.data <- function(single.cluster) {
cluster.graph2 <- graph.explode(single.cluster)
g2 <- graph.data.frame(cluster.graph2)
V(g2)$degcent <- centralization.degree(g2)$res
q2 <- as.data.frame(as.numeric(as.matrix(V(g2)$name)))
q2$v2<-as.matrix(V(g2)$degcent)
names(q2) <- c('id','degcent')
q2<-q2[order(q2$id, decreasing=TRUE),]
output <- merge(q2, single.cluster, by="id")
return(output)
}
# now we have to parse out closest nodes (by dist) to each highly
# central node -- first we have to get the data about distances
get.dists <- function(url) {
dist.json <- fromJSON(url, method='C')
dist.list <- as.data.frame(matrix(unlist(
lapply(dist.json$results, function(i) {
unlist(c(i$id,lapply(i$nearbyStations, function(j){unlist(cbind(j[1]))})))
})), nrow=length(dist.json$results), byrow=TRUE))
names(dist.list) <- c('id','close.one','close.two',
'close.three','close.four','close.five')
return(dist.list)
}
# now we have the graph data and the distances, so we have to
# combine these two together to return a list of recommended
# stations to repopulate
make.recs <- function(graph.data, dists) {
i <- 1
close <- data.frame(check = numeric(0))
results <- data.frame(id = numeric(0), name = character(0),
available = numeric(0), total = numeric(0),
stringsAsFactors=FALSE)
comb <- merge(graph.data, dists, by="id")
comb <- comb[order(comb$degcent, decreasing = T),]
# oh no a loop! kill it with fire!
# but seriously there's only like 300 total rows so i'm not
# going to sweat this one too much. maybe it can be refactored
# later
while(nrow(results) < 4) {
j <- comb[i,]
close[nrow(close) + 1,] <- c(j$close.one)
close[nrow(close) + 1,] <- c(j$close.two)
close[nrow(close) + 1,] <- c(j$close.three)
close[nrow(close) + 1,] <- c(j$close.four)
close[nrow(close) + 1,] <- c(j$close.five)
if(j$id %in% close$check) { NA }
else { results[nrow(results) + 1,] <- c(j$id, j$name, j$available, j$total) }
i <- i+1
}
return(results)
}
main <- function() {
stations.tocluster <- stations.prep(stations.url)
stations.cluster <- stations.kmeans(stations.tocluster)
dists <- get.dists(dist.url)
output <- data.frame(id = numeric(0), name = character(0),
available = numeric(0), total = numeric(0),
stringsAsFactors=FALSE)
for(i in 1:4) {
j <- stations.cluster[stations.cluster$clustered.cluster==i,]
recs <- make.recs(get.graph.data(j), dists)
output <- rbind(output,recs)
}
return(output)
}
c <- main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment