## Script to collect personal Citibike trip data and get route estimates for each trip
## API Structure
# {
#   "name": "citibike_bay",
#   "count": 8,
#   "frequency": "realtime",
#   "version": 1,
#   "newdata": true,
#   "lastrunstatus": "success",
#   "lastsuccess": "Tue Jul 22 2014 21:50:19 GMT+0000 (UTC)",
#   "results": {
#     "collection1": [
#       {
#         "Start_Station": "E 25 St & 2 Ave",
#         "Start_Time": "8/31/13 4:30:54 PM",
#         "End_Station": "Broadway & W 24 St",
#         "End_Time": "8/31/13 4:39:06 PM"
#       },
#       ....
#     ]
#   }
# }
## Required gems
require 'rest_client'
require 'pp'
require 'json'
require 'csv'
require 'time'
require 'uri'
# require 'date'
## Keys
## Credentials are taken from the environment when the matching variable is
## set, so real keys do not have to be written into this file; each one falls
## back to an empty string otherwise.
KIMONO_API = ENV.fetch('KIMONO_API', '')
KIMONO_API_KEY = ENV.fetch('KIMONO_API_KEY', '')
KIMONO_API_BEARER = ENV.fetch('KIMONO_API_BEARER', '')
GOOGLE_API_KEY = ENV.fetch('GOOGLE_API_KEY', '')
## CSV file that the header and trip rows are appended to
CSV_NAME = 'trips.csv'
## JSON file the station list is read from
STATIONS_DOC = 'stations.json'
## Number of pages of trip data to request (pages 1..N)
NUMBER_PAGES_CITIBIKE_DATA = 13
## Pass in Citibike time as a string
## Return string as a Time object
##
## time - String such as "8/31/13 4:30:54 PM"
##        (month/day/two-digit year, 12-hour clock with AM/PM).
##
## Returns the Time produced by Time.strptime for that string.
## Raises whatever Time.strptime raises when the string does not match.
def create_time_stamp(time)
  layout = '%m/%d/%y %I:%M:%S %p'
  # Zero-pad every run of digits ("8/31/13 4:30:54" -> "08/31/13 04:30:54")
  # so each field is at least two characters wide before parsing.
  padded = time.gsub(/\d+/) { |digits| format('%02d', digits.to_i) }
  Time.strptime(padded, layout)
end
## Build a lookup of station name -> [latitude, longitude].
##
## doc_name - path to a JSON file whose top-level "stationBeanList" array
##            holds objects with "stationName", "latitude" and "longitude".
##
## Returns a Hash mapping each station name to a two-element [lat, lon] Array.
def create_stations_hash(doc_name)
  ## Read in and parse the list of stations from the file named by doc_name
  ## (not a fixed path, so the argument is actually honoured).
  stations_list = JSON.parse(File.read(doc_name))['stationBeanList']
  ## Take the stations list and map station names to lat / lon
  stations_list.each_with_object({}) do |station, stations_hash|
    stations_hash[station['stationName']] = [station['latitude'], station['longitude']]
  end
end
## Station names that are wrong in the Citibike trip data, mapped to the
## name that should be used in their place.
STATION_NAME_CORRECTIONS = {
  'E 47 St & 2 Ave' => 'Greenwich Ave & Charles St',
  'Lawrence St & Willoughby St' => 'Lafayette St & Jersey St'
}.freeze

## Manually fix an error in Citibike station naming
##
## station - station name as read from the trip data.
##
## Returns the corrected name when the station is one of the known-bad names,
## otherwise the argument unchanged.
def catch_errors(station)
  STATION_NAME_CORRECTIONS.fetch(station, station)
end
## Load the station name -> [lat, lon] lookup used when writing trip rows
stations_hash = create_stations_hash(STATIONS_DOC)
## Set up the csv with column titles
## The file is opened in append mode, so only write the header when the file
## is missing or empty; otherwise every rerun would drop another header row
## into the middle of the existing data.
unless File.size?(CSV_NAME)
  CSV.open(CSV_NAME, 'a') do |f|
    f << ["start_station", "start_station_lat", "start_station_lon", "start_time", "end_station", "end_station_lat", "end_station_lon", "end_time", "actual_duration", "estimated_duration", "estimated_distance"]
  end
end
## Get Google Estimates For Cycling Time and Distance
##
## origin, destination - [lat, lon] pairs.
##
## Returns [estimated_distance, estimated_duration], read from the "value"
## fields of the first element of the Distance Matrix response. Both are 'NA'
## when the request fails or the response lacks those fields.
def google_cycling_estimate(origin, destination)
  query = URI.encode_www_form(
    origins: origin.join(','),
    destinations: destination.join(','),
    key: GOOGLE_API_KEY,
    mode: 'bicycling'
  )
  body = RestClient.get("https://maps.googleapis.com/maps/api/distancematrix/json?#{query}")
  element = JSON.parse(body)['rows'][0]['elements'][0]
  [element['distance']['value'], element['duration']['value']]
rescue StandardError => e
  # Best effort: a failed estimate must not drop the trip, so fall back to NA.
  pp "Google Distance Matrix Error: #{e.message}"
  %w[NA NA]
end

## Iterate over all the pages of trip data on citibike's website
(1..NUMBER_PAGES_CITIBIKE_DATA).each do |page|
  pp "Collecting Page: #{page}"
  ## Get the data from the API and subset by the results
  kimono_query = URI.encode_www_form(apikey: KIMONO_API_KEY, kimpath3: page)
  kimono_response = RestClient.get(
    "https://www.kimonolabs.com/api/#{KIMONO_API}?#{kimono_query}",
    { 'authorization' => "Bearer #{KIMONO_API_BEARER}" }
  )
  trips = JSON.parse(kimono_response)['results']['collection1']

  trips.each do |trip|
    begin
      ## Get the data wanted, catching naming errors in Citibike data
      start_station = catch_errors(trip['Start_Station'])
      end_station = catch_errors(trip['End_Station'])
      start_time = trip['Start_Time']
      end_time = trip['End_Time']

      ## Calculate the actual trip time (Time - Time gives seconds)
      actual_duration = create_time_stamp(end_time) - create_time_stamp(start_time)

      ## Turn station names into geocodes.
      ## fetch raises KeyError for an unknown station, which skips the row
      ## via the rescue below with a message naming the missing key.
      start_station_lat, start_station_lon = stations_hash.fetch(start_station)
      end_station_lat, end_station_lon = stations_hash.fetch(end_station)

      estimated_distance, estimated_duration = google_cycling_estimate(
        [start_station_lat, start_station_lon],
        [end_station_lat, end_station_lon]
      )

      ## Open the CSV and save the data.
      ## Re-opened per row so rows already collected are on disk if a later
      ## request aborts the run.
      CSV.open(CSV_NAME, 'a') do |f|
        f << [start_station, start_station_lat, start_station_lon, start_time, end_station, end_station_lat, end_station_lon, end_time, actual_duration, estimated_duration, estimated_distance]
      end
    ## Catch an error if the row is missing a station etc.
    rescue StandardError => e
      pp "NOT COMPLETE TRIP DATA: #{e.message}"
    end
  end

  ## Sometimes there are throttling issues with Kimono
  ## This is just a safety that solves this; no need to wait after the last page
  sleep 20 unless page == NUMBER_PAGES_CITIBIKE_DATA
end
## End of script.