Created
June 5, 2018 23:38
-
-
Save anonyth/d1ac7dde9fd7c2fb6457d1e377849ebf to your computer and use it in GitHub Desktop.
delay analysis of Trimet MAX routes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
#%%: import libraries | |
import pandas as pd | |
import matplotlib as plt | |
#%%: set increased ipython terminal width for local Spyder IDE | |
pd.set_option('display.width', 300) | |
typeface = {'fontname':'Helvetica'} | |
#%%: import full dataset for month timeframes | |
events_month = pd.read_csv('data/events_month.csv') | |
trips_month = pd.read_csv('data/trips_month.csv') | |
#%% some data dictionary references | |
# https://developer.trimet.org/definitions.shtml | |
# https://developer.trimet.org/ws_docs/arrivals2_ws.shtml | |
#%% look at an arbitrary tripID in isolation to help interpret dictionary | |
tr_6998908 = events_month.loc[(events_month['tripID'] == 6998908)].sort_values('event_timestamp') | |
tr_6998908.head(20) | |
#%% join trip and vehicle IDs into single df | |
delays = events_month.merge(trips_month, on=('tripID', 'vehicle_id')) | |
#%% re-order and drop columns of non-interest and non-negative delays | |
delays = delays[['delay', 'event_timestamp', 'tripID', 'vehicle_id', 'messageCode', 'routeNumber']] | |
delays = delays.loc[delays['delay'] < 0] | |
#%% assign MAX route names to numbers for friendlier results | |
max_routes = {90:'Red', 100:'Blue', 190:'Yellow', 200:'Green', 290:'Orange'} | |
delays['routeName'] = delays['routeNumber'].map(max_routes) | |
#%% return raw count of delays per MAX route, drop irrelevant columns | |
delays_count = delays.drop(axis=1, columns=['tripID', 'event_timestamp', 'vehicle_id', 'messageCode', 'routeNumber']) \ | |
.groupby(['routeName']).count() | |
#%% graph number of delays | |
delays_count_visual = delays_count.plot(kind='bar', figsize=(7,4)) | |
delays_count_visual.set(xlabel='MAX route', ylabel='recorded delays') | |
delays_count_visual.set_title('raw number of MAX delays per route in January 2017', y=1.02) | |
delays_count_visual.xaxis.labelpad = 12 | |
#%% return total seconds of delay per MAX route, drop irrelevant columns, get absolute value for plotting | |
delays_time = delays.drop(axis=1, columns=['tripID', 'event_timestamp', 'vehicle_id', 'messageCode', 'routeNumber']) \ | |
.groupby(['routeName']) \ | |
.delay.apply(lambda d: d.abs().sum()) | |
#%% graph duration of delays | |
delays_time_visual=delays_time.plot(kind='bar', figsize=(7,4)) | |
delays_time_visual.set(xlabel='MAX route', ylabel='seconds (millions)') | |
delays_time_visual.set_title('duration of MAX delays per route in January 2017', y=1.02) | |
delays_time_visual.xaxis.labelpad = 12 | |
#%% blend into one df with times delayed and total seconds | |
delays_total = pd.concat([delays_count, delays_time], axis=1) | |
delays_total.columns = ['times_delayed', 'seconds_delayed'] | |
#%% break out into minutes, hours, days for context | |
delays_total['minutes_delayed'] = delays_total['seconds_delayed'] / 60 | |
delays_total['hours_delayed'] = delays_total['minutes_delayed'] / 60 | |
delays_total['days_delayed'] = delays_total['hours_delayed'] / 24 | |
delays_total.head() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Above should yield the following from
delays_total
dataframe: