arcktip/top_stations.R

## top_stations.R
library('RCurl')
library('RJSONIO')
library('igraph')

stations.url <- getURL('http://citibikenyc.com/stations/json')
dist.url <- getURL('appservices.citibikenyc.com/data2/stations.php')

# prep stations for kmeans clustering
stations.prep <- function(url) {
  stations.json2 <- fromJSON(url, method='C')

  stations.list2 <- as.data.frame(matrix(unlist(
    lapply(stations.json2$stationBeanList, function(x){
      cbind(x$stationName, x$id, x$availableDocks, x$totalDocks, x$latitude, x$longitude)
    })), nrow=length(stations.json2$stationBeanList), byrow=T)
  )

  # fix up some problems with factors
  stations2 <- as.data.frame(cbind(
    as.numeric(as.character(stations.list2$V2)),
    as.numeric(as.character(stations.list2$V3)),
    as.numeric(as.character(stations.list2$V4)),
    as.numeric(as.character(stations.list2$V5)),
    as.numeric(as.character(stations.list2$V6))
  ))
  stations2$name<-as.character(stations.list2$V1)

  names(stations2) <- c('id','available','total','lat','long', 'name')
  stations.tocluster2 <- stations2[(
    stations2$total-stations2$available)/stations2$total < .2,]
  return(stations.tocluster2)

}

# cluster the stations
stations.kmeans <- function(stations.df) {
  clusterables <- data.frame(stations.df$lat,stations.df$long)
  row.names(clusterables) <- stations.df$id
  clustered <- kmeans(clusterables, 4)
  stations.cluster2 <- data.frame(stations.df, clustered$cluster)

  return(stations.cluster2)
}

# prep functions for igraph analysis
degrees.to.radians<-function(degrees) {
  return(degrees*pi/180)
}

haversine <- function(long1, lat1, long2, lat2) {
  R <- 6371 # Earth mean radius [km]
  delta.long <- (long2 - long1)
  delta.lat <- (lat2 - lat1)
  a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2
  c <- 2 * asin(min(1,sqrt(a)))
  d = R * c
  return(d) # Distance in km
}

haversine.distance <- function(long1, lat1, long2, lat2) {
  long1 <- degrees.to.radians(long1)
  lat1 <- degrees.to.radians(lat1)
  long2 <- degrees.to.radians(long2)
  lat2 <- degrees.to.radians(lat2)

  return(haversine(long1, lat1, long2, lat2))
}

# function to generate graph data, a df of columns node1, node2, distance

graph.explode <- function(clustered.df) {
  y <- data.frame(
    t(apply(
      combn(paste(clustered.df$id, clustered.df$lat, clustered.df$lon, sep=","), 2), 2,
      function(i){
        i.names <- unlist(strsplit(as.character(i), split='\\,'))
        latslongs <- c(as.numeric(i.names[2]),as.numeric(i.names[3]),
                       as.numeric(i.names[5]),as.numeric(i.names[6]))
        return(c(i.names[1],i.names[4],
                 haversine.distance(latslongs[1],latslongs[2],
                                    latslongs[3],latslongs[4])))
      })))
  # cast the weight to numeric
  y[,3] <- as.numeric(as.character(y[,3]))
  # to get a better graph, we are  going to dump connections with longer
  # distances than half a mile (~.8 km)
  y <- y[y[,3] < .8,]
  return(y)
}

# now that we are ready, dump everything into an igraph,
# extract the degree centrality, and return it out as a data.frame

get.graph.data <- function(single.cluster) {
  cluster.graph2 <- graph.explode(single.cluster)
  g2 <- graph.data.frame(cluster.graph2)
  V(g2)$degcent <- centralization.degree(g2)$res

  q2 <- as.data.frame(as.numeric(as.matrix(V(g2)$name)))
  q2$v2<-as.matrix(V(g2)$degcent)

  names(q2) <- c('id','degcent')
  q2<-q2[order(q2$id, decreasing=TRUE),]
  output <- merge(q2, single.cluster, by="id")
  return(output)
}

# now we have to parse out closest nodes (by dist) to each highly
# central node -- first we have to get the data about distances

get.dists <- function(url) {
  dist.json <- fromJSON(url, method='C')
  dist.list <- as.data.frame(matrix(unlist(
    lapply(dist.json$results, function(i) {
      unlist(c(i$id,lapply(i$nearbyStations, function(j){unlist(cbind(j[1]))})))
    })), nrow=length(dist.json$results), byrow=TRUE))
  names(dist.list) <- c('id','close.one','close.two',
                        'close.three','close.four','close.five')
  return(dist.list)
}

# now we have the graph data and the distances, so we have to
# combine these two together to return a list of recommended
# stations to repopulate

make.recs <- function(graph.data, dists) {
  i <- 1
  close <- data.frame(check = numeric(0))
  results <- data.frame(id = numeric(0), name = character(0),
                        available = numeric(0), total = numeric(0),
                        stringsAsFactors=FALSE)
  comb <- merge(graph.data, dists, by="id")
  comb <- comb[order(comb$degcent, decreasing = T),]
  # oh no a loop! kill it with fire!
  # but seriously there's only like 300 total rows so i'm not
  # going to sweat this one too much. maybe it can be refactored
  # later
  while(nrow(results) < 4) {
    j <- comb[i,]
    close[nrow(close) + 1,] <- c(j$close.one)
    close[nrow(close) + 1,] <- c(j$close.two)
    close[nrow(close) + 1,] <- c(j$close.three)
    close[nrow(close) + 1,] <- c(j$close.four)
    close[nrow(close) + 1,] <- c(j$close.five)
    if(j$id %in% close$check) { NA }
    else { results[nrow(results) + 1,] <- c(j$id, j$name, j$available, j$total) }
    i <- i+1
  }
  return(results)
}

main <- function() {
  stations.tocluster <- stations.prep(stations.url)
  stations.cluster <- stations.kmeans(stations.tocluster)
  dists <- get.dists(dist.url)
  output <- data.frame(id = numeric(0), name = character(0),
                        available = numeric(0), total = numeric(0),
                        stringsAsFactors=FALSE)
  for(i in 1:4) {
    j <- stations.cluster[stations.cluster$clustered.cluster==i,]
    recs <- make.recs(get.graph.data(j), dists)
    output <- rbind(output,recs)
  }
  return(output)
}

c <- main()
	library('RCurl')
	library('RJSONIO')
	library('igraph')

	stations.url <- getURL('http://citibikenyc.com/stations/json')
	dist.url <- getURL('appservices.citibikenyc.com/data2/stations.php')

	# prep stations for kmeans clustering
	stations.prep <- function(url) {
	stations.json2 <- fromJSON(url, method='C')

	stations.list2 <- as.data.frame(matrix(unlist(
	lapply(stations.json2$stationBeanList, function(x){
	cbind(x$stationName, x$id, x$availableDocks, x$totalDocks, x$latitude, x$longitude)
	})), nrow=length(stations.json2$stationBeanList), byrow=T)
	)

	# fix up some problems with factors
	stations2 <- as.data.frame(cbind(
	as.numeric(as.character(stations.list2$V2)),
	as.numeric(as.character(stations.list2$V3)),
	as.numeric(as.character(stations.list2$V4)),
	as.numeric(as.character(stations.list2$V5)),
	as.numeric(as.character(stations.list2$V6))
	))
	stations2$name<-as.character(stations.list2$V1)

	names(stations2) <- c('id','available','total','lat','long', 'name')
	stations.tocluster2 <- stations2[(
	stations2$total-stations2$available)/stations2$total < .2,]
	return(stations.tocluster2)

	}

	# cluster the stations
	stations.kmeans <- function(stations.df) {
	clusterables <- data.frame(stations.df$lat,stations.df$long)
	row.names(clusterables) <- stations.df$id
	clustered <- kmeans(clusterables, 4)
	stations.cluster2 <- data.frame(stations.df, clustered$cluster)

	return(stations.cluster2)
	}

	# prep functions for igraph analysis
	degrees.to.radians<-function(degrees) {
	return(degrees*pi/180)
	}

	haversine <- function(long1, lat1, long2, lat2) {
	R <- 6371 # Earth mean radius [km]
	delta.long <- (long2 - long1)
	delta.lat <- (lat2 - lat1)
	a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2
	c <- 2 * asin(min(1,sqrt(a)))
	d = R * c
	return(d) # Distance in km
	}

	haversine.distance <- function(long1, lat1, long2, lat2) {
	long1 <- degrees.to.radians(long1)
	lat1 <- degrees.to.radians(lat1)
	long2 <- degrees.to.radians(long2)
	lat2 <- degrees.to.radians(lat2)

	return(haversine(long1, lat1, long2, lat2))
	}

	# function to generate graph data, a df of columns node1, node2, distance

	graph.explode <- function(clustered.df) {
	y <- data.frame(
	t(apply(
	combn(paste(clustered.df$id, clustered.df$lat, clustered.df$lon, sep=","), 2), 2,
	function(i){
	i.names <- unlist(strsplit(as.character(i), split='\\,'))
	latslongs <- c(as.numeric(i.names[2]),as.numeric(i.names[3]),
	as.numeric(i.names[5]),as.numeric(i.names[6]))
	return(c(i.names[1],i.names[4],
	haversine.distance(latslongs[1],latslongs[2],
	latslongs[3],latslongs[4])))
	})))
	# cast the weight to numeric
	y[,3] <- as.numeric(as.character(y[,3]))
	# to get a better graph, we are going to dump connections with longer
	# distances than half a mile (~.8 km)
	y <- y[y[,3] < .8,]
	return(y)
	}

	# now that we are ready, dump everything into an igraph,
	# extract the degree centrality, and return it out as a data.frame

	get.graph.data <- function(single.cluster) {
	cluster.graph2 <- graph.explode(single.cluster)
	g2 <- graph.data.frame(cluster.graph2)
	V(g2)$degcent <- centralization.degree(g2)$res

	q2 <- as.data.frame(as.numeric(as.matrix(V(g2)$name)))
	q2$v2<-as.matrix(V(g2)$degcent)

	names(q2) <- c('id','degcent')
	q2<-q2[order(q2$id, decreasing=TRUE),]
	output <- merge(q2, single.cluster, by="id")
	return(output)
	}

	# now we have to parse out closest nodes (by dist) to each highly
	# central node -- first we have to get the data about distances

	get.dists <- function(url) {
	dist.json <- fromJSON(url, method='C')
	dist.list <- as.data.frame(matrix(unlist(
	lapply(dist.json$results, function(i) {
	unlist(c(i$id,lapply(i$nearbyStations, function(j){unlist(cbind(j[1]))})))
	})), nrow=length(dist.json$results), byrow=TRUE))
	names(dist.list) <- c('id','close.one','close.two',
	'close.three','close.four','close.five')
	return(dist.list)
	}

	# now we have the graph data and the distances, so we have to
	# combine these two together to return a list of recommended
	# stations to repopulate

	make.recs <- function(graph.data, dists) {
	i <- 1
	close <- data.frame(check = numeric(0))
	results <- data.frame(id = numeric(0), name = character(0),
	available = numeric(0), total = numeric(0),
	stringsAsFactors=FALSE)
	comb <- merge(graph.data, dists, by="id")
	comb <- comb[order(comb$degcent, decreasing = T),]
	# oh no a loop! kill it with fire!
	# but seriously there's only like 300 total rows so i'm not
	# going to sweat this one too much. maybe it can be refactored
	# later
	while(nrow(results) < 4) {
	j <- comb[i,]
	close[nrow(close) + 1,] <- c(j$close.one)
	close[nrow(close) + 1,] <- c(j$close.two)
	close[nrow(close) + 1,] <- c(j$close.three)
	close[nrow(close) + 1,] <- c(j$close.four)
	close[nrow(close) + 1,] <- c(j$close.five)
	if(j$id %in% close$check) { NA }
	else { results[nrow(results) + 1,] <- c(j$id, j$name, j$available, j$total) }
	i <- i+1
	}
	return(results)
	}

	main <- function() {
	stations.tocluster <- stations.prep(stations.url)
	stations.cluster <- stations.kmeans(stations.tocluster)
	dists <- get.dists(dist.url)
	output <- data.frame(id = numeric(0), name = character(0),
	available = numeric(0), total = numeric(0),
	stringsAsFactors=FALSE)
	for(i in 1:4) {
	j <- stations.cluster[stations.cluster$clustered.cluster==i,]
	recs <- make.recs(get.graph.data(j), dists)
	output <- rbind(output,recs)
	}
	return(output)
	}

	c <- main()