Skip to content

Instantly share code, notes, and snippets.

@cameronlai
Created August 4, 2018 16:06
Show Gist options
  • Save cameronlai/3a2475e422fb1c26b74a91162a1daf54 to your computer and use it in GitHub Desktop.
Save cameronlai/3a2475e422fb1c26b74a91162a1daf54 to your computer and use it in GitHub Desktop.
Mapping bus data from data.gov.hk to GTFS format
import os
import re
import datetime
import pyodbc
from pyproj import Proj, transform
import zipfile
# File Directories
dirMDB = "./data_gov/"
output_folder = "./my-feed/"
# Coordinate Transform
WGS84 = Proj(init='EPSG:4326')
HK80 = Proj(init='EPSG:2326')
# DB related
conn_str_prefix = "Driver={Microsoft Access Driver (*.mdb)};DBQ="
delta_mins = 5
# from company_code.mdb to agency.txt
conn_str = conn_str_prefix + dirMDB + "COMPANY_CODE.mdb"
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'agency.txt', 'w') as f:
f.write('agency_id,agency_name,agency_url,agency_timezone\n')
cur.execute(r"SELECT * FROM COMPANY_CODE")
companies = cur.fetchall()
for c in companies:
f.write('%s,%s,http://www.xxx.com,Asia/Shanghai\n' % (c.COMPANY_CODE, c.COMPANY_NAMEE))
# Load route db
# from rstop_bus.mdb to routes.txt
route_id_ary = []
conn_str = conn_str_prefix + dirMDB + "ROUTE_BUS.mdb"
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'routes.txt', 'w') as f:
f.write('route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color\n')
cur.execute(r"SELECT DISTINCT ROUTE_ID, ROUTE_NAMEE, COMPANY_CODE, HYPERLINK_E FROM ROUTE")
route = cur.fetchall()
for r in route:
route_id_ary.append(r.ROUTE_ID)
f.write('%d,%s,,%s,,3,%s,,\n' % (r.ROUTE_ID, r.COMPANY_CODE, r.ROUTE_NAMEE, r.HYPERLINK_E))
# from rstop_bus.mdb to trips.txt
with open(output_folder + 'trips.txt', 'w') as f:
f.write('route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id\n')
for r in route:
for direction in range(2):
trip_id = '%d_%d' % (r.ROUTE_ID, direction)
f.write('%d,%s,%s,,%d,,\n' % (r.ROUTE_ID, 'FULLW', trip_id, direction))
# Get back rstop id
conn_str = conn_str_prefix + dirMDB + "RSTOP_BUS.mdb"
rstop_data = []
with pyodbc.connect(conn_str) as conn:
conn = pyodbc.connect(conn_str)
cur = conn.cursor()
execute_str = r"SELECT STOP_ID, MAX(STOP_NAMEE) FROM RSTOP GROUP BY STOP_ID"
cur.execute(execute_str)
rstop_data = cur.fetchall() # This includes stop id and field name
# from stop_bus.mdb to stops.txt
conn_str = conn_str_prefix + dirMDB + "STOP_BUS.mdb"
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'stops.txt', 'w') as f:
f.write('stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url\n')
stop_count = 0
for rstop in rstop_data:
stop_id = rstop[0]
stop_namee = rstop[1]
stop_namee = stop_namee.replace(',', ' ')
stop_namee = re.sub(r'[^\x00-\x7f]',r' ',stop_namee)
cur.execute(r"SELECT STOP_ID, X, Y FROM STOP WHERE STOP_ID=%d" % stop_id)
stop = cur.fetchone()
x, y = transform(HK80, WGS84, stop.X, stop.Y)
f.write('%d,"%s",,%f,%f,%d,\n' % (stop_id, stop_namee, y, x, stop_count / 3))
stop_count += 1
# from stop_bus.mdb to stop_times.txt
conn_str = conn_str_prefix + dirMDB + "RSTOP_BUS.mdb"
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'stop_times.txt', 'w') as f:
f.write('trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled\n')
start_time = datetime.datetime(2018,1,1,5,30,0)
for route_id in route_id_ary:
for direction in range(2):
trip_id = '%d_%d' % (route_id, direction)
stop_count = 0
execute_str = r"SELECT STOP_ID, STOP_SEQ FROM RSTOP WHERE ROUTE_ID = %d AND ROUTE_SEQ = %d ORDER BY STOP_SEQ" % (route_id, direction+1)
cur.execute(execute_str)
rstop_data = cur.fetchall()
for rstop in rstop_data:
offset = datetime.timedelta(minutes=5*stop_count)
arr_time = (start_time + offset).strftime('%H:%M:%S')
offset = datetime.timedelta(minutes=5*stop_count+1)
dep_time = (start_time + offset).strftime('%H:%M:%S')
f.write('%s,%s,%s,%d,%d,,,,\n' % (trip_id,arr_time,dep_time,rstop.STOP_ID,rstop.STOP_SEQ))
stop_count += 1
# calendar.txt
with open(output_folder + 'calendar.txt', 'w') as f:
f.write('service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date\n')
f.write('FULLW,1,1,1,1,1,1,1,20180101,20181231\n')
# calendar_dates.txt
with open(output_folder + 'calendar_dates.txt', 'w') as f:
f.write('service_id,date,exception_type\n')
f.write('FULLW,20070604,2\n')
# fare_attributes.txt
conn_str = conn_str_prefix + dirMDB + "FARE_BUS.mdb"
fare_list = []
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'fare_attributes.txt', 'w') as f:
f.write('fare_id,price,currency_type,payment_method,transfers,transfer_duration\n')
execute_str = r"SELECT DISTINCT PRICE FROM FARE"
cur.execute(execute_str)
fare_data = cur.fetchall()
for i, d in enumerate(fare_data):
f.write('%d,%f,HKD,0,0,\n' % (i, d.PRICE))
fare_list.append(d.PRICE)
# fare_rules.txt
conn_str = conn_str_prefix + dirMDB + "FARE_BUS.mdb"
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'fare_rules.txt', 'w') as f:
f.write('fare_id,route_id,origin_id,destination_id,contains_id\n')
execute_str = r"SELECT ROUTE_ID, ROUTE_SEQ, ON_SEQ, OFF_SEQ, PRICE FROM FARE ORDER BY ROUTE_ID"
cur.execute(execute_str)
fare_data = cur.fetchall()
for d in fare_data:
fare_id = fare_list.index(d.PRICE)
f.write('%d,%s,%d,%d,\n' % (fare_id, d.ROUTE_ID, d.ON_SEQ, d.OFF_SEQ))
# Zip to output file
with zipfile.ZipFile('my-feed.zip', mode='w') as zf:
for f in os.listdir('my-feed'):
zf.write(output_folder + f, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment