Skip to content

Instantly share code, notes, and snippets.

@cameronlai
Created August 4, 2018 16:06
Embed
What would you like to do?
Mapping bus data from data.gov.hk to GTFS format
import os
import re
import datetime
import pyodbc
from pyproj import Proj, transform
import zipfile
# File Directories
dirMDB = "./data_gov/"
output_folder = "./my-feed/"
# Coordinate Transform
WGS84 = Proj(init='EPSG:4326')
HK80 = Proj(init='EPSG:2326')
# DB related
conn_str_prefix = "Driver={Microsoft Access Driver (*.mdb)};DBQ="
delta_mins = 5
# from company_code.mdb to agency.txt
conn_str = conn_str_prefix + dirMDB + "COMPANY_CODE.mdb"
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'agency.txt', 'w') as f:
f.write('agency_id,agency_name,agency_url,agency_timezone\n')
cur.execute(r"SELECT * FROM COMPANY_CODE")
companies = cur.fetchall()
for c in companies:
f.write('%s,%s,http://www.xxx.com,Asia/Shanghai\n' % (c.COMPANY_CODE, c.COMPANY_NAMEE))
# Load route db
# from rstop_bus.mdb to routes.txt
route_id_ary = []
conn_str = conn_str_prefix + dirMDB + "ROUTE_BUS.mdb"
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'routes.txt', 'w') as f:
f.write('route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color\n')
cur.execute(r"SELECT DISTINCT ROUTE_ID, ROUTE_NAMEE, COMPANY_CODE, HYPERLINK_E FROM ROUTE")
route = cur.fetchall()
for r in route:
route_id_ary.append(r.ROUTE_ID)
f.write('%d,%s,,%s,,3,%s,,\n' % (r.ROUTE_ID, r.COMPANY_CODE, r.ROUTE_NAMEE, r.HYPERLINK_E))
# from rstop_bus.mdb to trips.txt
with open(output_folder + 'trips.txt', 'w') as f:
f.write('route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id\n')
for r in route:
for direction in range(2):
trip_id = '%d_%d' % (r.ROUTE_ID, direction)
f.write('%d,%s,%s,,%d,,\n' % (r.ROUTE_ID, 'FULLW', trip_id, direction))
# Get back rstop id
conn_str = conn_str_prefix + dirMDB + "RSTOP_BUS.mdb"
rstop_data = []
with pyodbc.connect(conn_str) as conn:
conn = pyodbc.connect(conn_str)
cur = conn.cursor()
execute_str = r"SELECT STOP_ID, MAX(STOP_NAMEE) FROM RSTOP GROUP BY STOP_ID"
cur.execute(execute_str)
rstop_data = cur.fetchall() # This includes stop id and field name
# from stop_bus.mdb to stops.txt
conn_str = conn_str_prefix + dirMDB + "STOP_BUS.mdb"
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'stops.txt', 'w') as f:
f.write('stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url\n')
stop_count = 0
for rstop in rstop_data:
stop_id = rstop[0]
stop_namee = rstop[1]
stop_namee = stop_namee.replace(',', ' ')
stop_namee = re.sub(r'[^\x00-\x7f]',r' ',stop_namee)
cur.execute(r"SELECT STOP_ID, X, Y FROM STOP WHERE STOP_ID=%d" % stop_id)
stop = cur.fetchone()
x, y = transform(HK80, WGS84, stop.X, stop.Y)
f.write('%d,"%s",,%f,%f,%d,\n' % (stop_id, stop_namee, y, x, stop_count / 3))
stop_count += 1
# from stop_bus.mdb to stop_times.txt
conn_str = conn_str_prefix + dirMDB + "RSTOP_BUS.mdb"
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'stop_times.txt', 'w') as f:
f.write('trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled\n')
start_time = datetime.datetime(2018,1,1,5,30,0)
for route_id in route_id_ary:
for direction in range(2):
trip_id = '%d_%d' % (route_id, direction)
stop_count = 0
execute_str = r"SELECT STOP_ID, STOP_SEQ FROM RSTOP WHERE ROUTE_ID = %d AND ROUTE_SEQ = %d ORDER BY STOP_SEQ" % (route_id, direction+1)
cur.execute(execute_str)
rstop_data = cur.fetchall()
for rstop in rstop_data:
offset = datetime.timedelta(minutes=5*stop_count)
arr_time = (start_time + offset).strftime('%H:%M:%S')
offset = datetime.timedelta(minutes=5*stop_count+1)
dep_time = (start_time + offset).strftime('%H:%M:%S')
f.write('%s,%s,%s,%d,%d,,,,\n' % (trip_id,arr_time,dep_time,rstop.STOP_ID,rstop.STOP_SEQ))
stop_count += 1
# calendar.txt
with open(output_folder + 'calendar.txt', 'w') as f:
f.write('service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date\n')
f.write('FULLW,1,1,1,1,1,1,1,20180101,20181231\n')
# calendar_dates.txt
with open(output_folder + 'calendar_dates.txt', 'w') as f:
f.write('service_id,date,exception_type\n')
f.write('FULLW,20070604,2\n')
# fare_attributes.txt
conn_str = conn_str_prefix + dirMDB + "FARE_BUS.mdb"
fare_list = []
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'fare_attributes.txt', 'w') as f:
f.write('fare_id,price,currency_type,payment_method,transfers,transfer_duration\n')
execute_str = r"SELECT DISTINCT PRICE FROM FARE"
cur.execute(execute_str)
fare_data = cur.fetchall()
for i, d in enumerate(fare_data):
f.write('%d,%f,HKD,0,0,\n' % (i, d.PRICE))
fare_list.append(d.PRICE)
# fare_rules.txt
conn_str = conn_str_prefix + dirMDB + "FARE_BUS.mdb"
with pyodbc.connect(conn_str) as conn:
cur = conn.cursor()
with open(output_folder + 'fare_rules.txt', 'w') as f:
f.write('fare_id,route_id,origin_id,destination_id,contains_id\n')
execute_str = r"SELECT ROUTE_ID, ROUTE_SEQ, ON_SEQ, OFF_SEQ, PRICE FROM FARE ORDER BY ROUTE_ID"
cur.execute(execute_str)
fare_data = cur.fetchall()
for d in fare_data:
fare_id = fare_list.index(d.PRICE)
f.write('%d,%s,%d,%d,\n' % (fare_id, d.ROUTE_ID, d.ON_SEQ, d.OFF_SEQ))
# Zip to output file
with zipfile.ZipFile('my-feed.zip', mode='w') as zf:
for f in os.listdir('my-feed'):
zf.write(output_folder + f, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment