Skip to content

Instantly share code, notes, and snippets.

@mwidjaja1
Last active December 26, 2017 22:45
Show Gist options
  • Save mwidjaja1/230f8cc38b4353decfee to your computer and use it in GitHub Desktop.
Save mwidjaja1/230f8cc38b4353decfee to your computer and use it in GitHub Desktop.
With NYC Subway Data from 2009-2011, I analyzed each subway line to derive conclusions regarding on-time performance & reliability.
""" Bokeh (NYC Subway) ---------------------------------------------------------
Goal: This script takes the NYC Subway data & parses it for visualization
techniques.
Input: http://web.mta.info/developers/performance.html has performance XML
data. We only cover subway and we only care for these metrics:
1. Subway wait assessment for all lines
Actual interval between trains
2. Mean Distance Between Failure
Miles until a train hits a mechanical failure causing a delay
3. On-Time Performance for each line, not the total (OTP)
Trains arriving within 5 minutes of its scheduled time.
4. Total Ridership
All customers riding, including those riding with free transfers.
Each of those metrics consists of sub-metrics. We only care for:
MONTHLY_ACTUAL, MONTHLY_TARGET, PERIOD_MONTH, & PERIOD_YEAR
Output: We get CSV files (for the slope/growth of each statistic) and/or
HTML plots for each of the following categories:
1. Failure.html & subFailStats.csv:
Amount of failures across all lines in the subway system
2. OTP.html & subOTPStats.csv:
On time performance for each subway line in the system
3. OTPFamily.html:
#2 for each family of subway lines
4. OTPWait.html:
On time performance & wait time assessments for each subway line
5. OTPWaitFamily.html:
#4 but for each family of subway lines
6. Riders.html & subTRDStats.csv:
Amount of riders across all lines in the subway system
7. Wait.html & subWaitStats.csv:
Wait time assessment for each subway line in the system
8. WaitFamily.html
#7 for each family of subway lines
-----------------------------------------------------------------------------"""
import numpy as np
from bokeh.plotting import figure, output_file, show, gridplot
from sklearn import linear_model
import csv
import datetime as dt
import xmltodict
""" convertVal -----------------------------------------------------------------
Goal: This converts a (list of) value(s) to a float if possible.
From: Called from dictAdd
Input: (1) The list of values
Output: A converted list of values
-----------------------------------------------------------------------------"""
def convertVal(value):
    """Convert a single value to a float, or return '' if that is impossible.

    Text values may contain thousands separators (commas); these are removed
    before the conversion.  Values that are already numeric are converted
    directly instead of being thrown away.

    Args:
        value: The raw value, normally a string.

    Returns:
        The value as a float, or the empty string '' when it cannot be
        converted (for example None, an empty string or non-numeric text).
    """
    try:
        value = value.replace(',', '')  # Removes commas from numbers
    except (AttributeError, TypeError):
        pass  # Not text (e.g. already a number or None); convert it as-is
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return ''  # '' marks a missing / unparseable value
""" dictAdd --------------------------------------------------------------------
Goal: We are given a dictionary for each item (like a subway line). This
function takes each dictionary and appends it to a master dictionary
which contains data for all of the items (like every subway line).
If there are multiple dictionaries for each item, we'll append the
values to the master dictionary so that no values get deleted.
From: Called from main
Input: (1) A master dictionary (of all subway lines) where (2) a smaller
dictionary (of one subway line) will be added to.
We will save (4) certain keys of data to this smaller dictionary &
this smaller dictionary will be saved to the larger dictionary under
(3) one key (i.e. subway line).
Output: A master dictionary which has the smaller dictionary appended.
-----------------------------------------------------------------------------"""
def dictAdd(masterDict, smallDict, oldLine, keys):
    """Append selected values of one record to the master dictionary.

    The entry name is derived from oldLine: the text between the first and
    second '-' is used when a '-' is present, otherwise the whole name, and
    all spaces are removed.  Names containing a capital 'S' are then mapped
    onto one of the 'SLine...' names.

    Args:
        masterDict: Dictionary of {line: {key: list of values}}; it is
            updated in place.
        smallDict: One record; its values for `keys` are converted with
            convertVal and appended.
        oldLine: The raw name of the record (e.g. an indicator name).
        keys: The keys of smallDict that should be stored.

    Returns:
        masterDict, with one value appended to every list named in `keys`.

    Raises:
        KeyError: If smallDict lacks one of `keys`.
    """
    # Renames the subway line (i.e. key) so it only includes the subway line
    # without any spaces
    try:
        line = oldLine.split('-')[1]
    except IndexError:  # No '-' in the name, so the whole name is used
        line = oldLine
    line = line.replace(' ', '')

    # The S Line is named inconsistently so this renames them.
    # NOTE(review): this matches ANY name containing a capital 'S' (e.g.
    # 'Subways'), not only shuttle lines -- confirm that this is intended.
    if 'S' in line:
        if '42St' in line:
            line = 'SLine42St'
        elif 'Fkln' in line:
            line = 'SLineFkln'
        elif 'Rock' in line:
            line = 'SLineRock'
        else:
            line = 'SLine'

    # Creates the entry on first sight, then appends in place; no copy of the
    # existing values is needed.
    lineData = masterDict.setdefault(line, {})
    for subKey in keys:
        # Converted first so that a missing key leaves no empty list behind
        newValue = convertVal(smallDict[subKey])
        lineData.setdefault(subKey, []).append(newValue)
    return masterDict
""" createDate -----------------------------------------------------------------
Goal: Combines the values of two keys in our dictionary. In our case,
we'll do this to combine the month + year column = date column.
Because of this, we assume that all of the values in the oldKeys
should be integers.
From: Called from main
Input: (1) The dictionary of data with the (2) first key & (3) second key
which will be combined and saved as (4) a new key name.
Output: The dictionary of data where the two columns are added.
-----------------------------------------------------------------------------"""
def createDate(masterDict, oldKey1, oldKey2, newKey):
    """Add a datetime column built from a month column and a year column.

    Args:
        masterDict: Dictionary of {line: {key: list of values}}; it is
            updated in place.
        oldKey1: Key of the month values (must be convertible with int()).
        oldKey2: Key of the year values (must be convertible with int()).
        newKey: Key under which the list of datetimes is stored.

    Returns:
        masterDict, where every entry has gained `newKey`.
    """
    for record in masterDict.values():
        # The values are stored as floats, so they go through int() to lose
        # the trailing '.0' before being parsed as text.
        months = [str(int(raw)) for raw in record[oldKey1]]
        years = [str(int(raw)) for raw in record[oldKey2]]
        record[newKey] = [
            dt.datetime.strptime(year + '/' + month, '%Y/%m')
            for month, year in zip(months, years)
        ]
    return masterDict
""" bokehPlot ------------------------------------------------------------------
Goal: Makes a plot for the actual & target value per each month & year.
From: Called from main
Input: (1) The dictionary of data with the (2) x-axis key name & the
(3) y-axis key-name (can be a list) from said dictionary.
A plot will be saved onto an HTML file with a (4) pTitle,
(5) x-axis label, (6) y-axis label, & (7) the names of each data
point set for the legend (to skip the legend, set pLegend=None)
The plot will be saved as a (8) file name (don't include extension)
in (9) a True = Grid fLayout or False = One Plot fLayout.
Output: A HTML plot for the actual and target values in the pwd.
-----------------------------------------------------------------------------"""
def bokehPlot(masterDict, xKey, yKey, pTitle, pxLabel, pyLabel, pLegend, fName, fGrid):
    """Plot the requested keys for every entry of masterDict and show the page.

    One figure is created per key of masterDict (in sorted order) and filled
    through circlePlot.  Figures for which none of the yKey names were found
    are left out.

    Args:
        masterDict: Dictionary of {line: {key: list of values}}.
        xKey: Key holding the x-axis values.
        yKey: List of keys to plot on the y-axis.
        pTitle: Page title; also the prefix of every figure title.
        pxLabel: x-axis label.
        pyLabel: y-axis label.
        pLegend: Legend names, one per yKey entry, or None for no legend.
        fName: Output file name without the '.html' extension.
        fGrid: True to show all figures in a two-column grid, False to show
            only the first figure.

    Raises:
        IndexError: If fGrid is False and no figure contained any data.
    """
    # Output to static HTML file
    output_file(fName+".html", title=pTitle)

    pList = []  # The figures which actually contain data

    # Loops through each subway line in masterDict
    for line in sorted(masterDict):
        # Create a new plot with a pTitle and axis labels
        if fGrid:
            p = figure(title=pTitle+' '+line, width=525, plot_height=350,
                       x_axis_label=pxLabel, x_axis_type="datetime",
                       y_axis_label=pyLabel, title_text_font_size='14pt')
        else:
            p = figure(title=pTitle+' '+line, x_axis_label=pxLabel,
                       x_axis_type="datetime", y_axis_label=pyLabel)

        # The status starts at False for EVERY line.  Carrying it over from
        # the previous line would keep empty figures once any earlier line
        # had data.
        p, status = circlePlot(p, masterDict, line, xKey, yKey, pLegend, False)

        # Append this plot to the list of all plots
        if status:
            p.left[0].formatter.use_scientific = False  # Removes sci notation
            pList.append(p)

    if fGrid:
        # Grid layout with two plots per row
        pPlot = gridfLayout(pList, 2)
    else:
        # Single plot: only the first figure is shown
        if not pList:
            raise IndexError('bokehPlot: none of the keys %r were found; '
                             'there is nothing to plot' % (yKey,))
        pPlot = pList[0]

    # Shows plot
    show(pPlot)
""" circlePlot -----------------------------------------------------------------
Goal: Creates a circle plot from a dictionary of data named after some key
(i.e. line) and aligns it to an X-Axis
From: Called from bokehPlot
Input: (1) The current figure which is being worked on.
(2) The master dictionary of data with the (3) key of the smaller
dictionary (i.e. current subway line) which is being evaluated in
the master dict.
We will look in the masterDict[line] for a (4) x-axis key name &
(5) y-axis key-names (yKey can be given as a list).
This line will be saved on the pLeg as (6) pLeg. Finally, we
provide a (7) status if we found data.
Output: The figure (with the circles added) and the updated status flag
-----------------------------------------------------------------------------"""
def circlePlot(p, masterDict, line, xKey, yKey, pLegend, status):
    """Add one circle series per available y key to a figure.

    Args:
        p: The figure to draw on; its circle() method is called once for
            every y key found.
        masterDict: Dictionary of {line: {key: list of values}}.
        line: The entry of masterDict to plot.
        xKey: Key holding the x-axis values.
        yKey: List of keys to plot; keys missing from the entry are skipped.
        pLegend: Legend names, indexed like yKey, or None for no legend.
        status: The incoming "found data" flag.

    Returns:
        A tuple (p, status).  status is True if at least one y key was
        plotted, otherwise it is returned unchanged.
    """
    clr = ['blue', 'green', 'orange', 'red']  # Colors to plot with

    # Loops between each yKey key name
    for idx, subKey in enumerate(yKey):
        if subKey not in masterDict[line]:  # Skips keys without data
            continue
        status = True
        color = clr[idx % len(clr)]  # Colors repeat beyond the 4th series
        x = masterDict[line][xKey]    # List of xKey values
        y = masterDict[line][subKey]  # List of yKey values
        if pLegend is not None:
            p.circle(x, y, legend=pLegend[idx], color=color, line_width=2)
        else:
            p.circle(x, y, color=color, line_width=2)
    return p, status
""" gridfLayout ----------------------------------------------------------------
Goal: Takes a list of Bokeh Plots and creates a list of lists out of them
so that the list becomes a two-dimensional list, each sub-list with
as many values as there are plots in one row.
From: Called from bokehPlot
Input: (1) The list of plots and (2) how many columns should be made.
Output: A list of lists
-----------------------------------------------------------------------------"""
def gridfLayout(pList, col):
    """Arrange a flat list of plots into rows and build a grid plot.

    Args:
        pList: The list of plots.
        col: How many plots go into one row.

    Returns:
        The result of gridplot() for the list of rows.  The last row is
        shorter when len(pList) is not a multiple of col.
    """
    # Starts at index 0 so that the first plot is part of the grid as well;
    # slicing past the end of the list simply yields a shorter last row.
    pLists = [pList[i:i + col] for i in range(0, len(pList), col)]

    # Creates gridplot based on the list of lists and returns it
    return gridplot(pLists)
""" combineDict ----------------------------------------------------------------
Goal: Combines two dictionaries and merges them based on their xKey.
From: Called from main
Input: (1) The first and (2) second dictionary which will be merged based
on their (3) xKey key name. In the (4) list of yKey key names,
the 'MONTHLY' in each key's name will be replaced by a string for
(5) the first and (6) the second dict.
Output: The combined dictionary
-----------------------------------------------------------------------------"""
def combineDict(inDict1, inDict2, yKey, xKey, name1, name2):
    """Merge two data dictionaries line by line.

    In every key name the text 'MONTHLY' is replaced by name1 (for keys
    coming from inDict1) or name2 (for keys coming from inDict2).  Keys that
    end up with the same name in both dictionaries take the value from
    inDict2.  The value lists are shared with the inputs, not copied.

    Args:
        inDict1: First dictionary of {line: {key: values}}; it decides which
            lines exist in the result.
        inDict2: Second dictionary; lines missing from inDict1 are ignored.
        yKey: Not used by this function.
        xKey: Not used by this function.
        name1: Replacement for 'MONTHLY' in the keys of inDict1.
        name2: Replacement for 'MONTHLY' in the keys of inDict2.

    Returns:
        The combined dictionary.
    """
    masterDict = {}

    # Loops between each key in the first dictionary
    for line in inDict1:
        masterDict[line] = {}  # Creates entry for subway line
        try:
            for subKey in inDict1[line]:
                subKeyNew = subKey.replace('MONTHLY', name1)
                masterDict[line][subKeyNew] = inDict1[line][subKey]
        except (AttributeError, TypeError):
            # Entry is not a dictionary with text keys; keep what was copied
            continue

    # Loops between each key in the second dictionary
    for line in inDict2:
        if line not in masterDict:  # Only lines known from inDict1 are merged
            continue
        try:
            for subKey in inDict2[line]:
                subKeyNew = subKey.replace('MONTHLY', name2)
                masterDict[line][subKeyNew] = inDict2[line][subKey]
        except (AttributeError, TypeError):
            continue
    return masterDict
""" combineLine ----------------------------------------------------------------
Goal: Combines many lines from one dictionary and merges them based on
their xKey. This assumes that each family of line has data recorded
over the same period of time
From: Called from main
Input: (1) The first dictionary which will be merged over the (2) lists
of subway lines which should be merged together (this should be a
stacked list where the lowest list is the list of all subway lines
that should be merged) and the average of each line's (3) key name.
The (4) x-axis' key name should be the same across all lines & thus
we will copy the x-axis values of the last line listed in each family.
Output: The combined dictionary where we have keys only for unified lines
-----------------------------------------------------------------------------"""
def combineLine(inDict, familyList, yKey, xKey):
    """Average one key over the subway lines of each family.

    Args:
        inDict: Dictionary of {line: {key: list of values}}.
        familyList: Dictionary of {family name: list of line names}.
        yKey: The key whose values are averaged position by position.
        xKey: The key of the x-axis values; they are taken from the last
            line listed in the family.

    Returns:
        A dictionary {family: {yKey: averages, xKey: x values}}.  Missing
        values ('') are left out of an average; a position where every line
        is missing gets '' instead of a number.

    Raises:
        KeyError: If a listed line (or one of the keys) is not in inDict.
        IndexError: If a family has no lines, or a line has fewer values
            than the first line of its family.
    """
    familyDict = {}

    # Loops between groups of family
    for family in familyList:
        lines = familyList[family]

        # One list of yKey values per line of the family
        valuesList = [list(inDict[line][yKey]) for line in lines]
        dateRange = inDict[lines[-1]][xKey]

        # For every position (i.e. month/year) the values of all lines are
        # collected and averaged.
        averageList = []
        for valIdx in range(len(valuesList[0])):
            found = [values[valIdx] for values in valuesList
                     if values[valIdx] != '']
            averageList.append(np.mean(found) if found else '')

        # Saves averages and dates to the familyDict. The key is the family
        # of subway lines.
        familyDict[family] = {yKey: averageList, xKey: dateRange}
    return familyDict
""" lineStats ------------------------------------------------------------------
Goal: Performs a linear regression for each 'line' in the master
dictionary to return its slope and r2 value.
From: Called from main
Input: (1) The master dictionary, the (2) single key of data to analyze,
(3) and True=LogRegression or False=LinearRegression
Output: A dictionary with the slope, r2 value, and the last data point (that
is, the most recent data point obtained)
-----------------------------------------------------------------------------"""
def lineStats(masterDict, yKey, logReg):
    """Fit a regression over time for one key of every line.

    The x-axis of the regression is simply 1, 2, 3, ... for the values that
    are present; missing values ('') are dropped first.

    Args:
        masterDict: Dictionary of {line: {key: list of values}}.
        yKey: The single key of data to analyze.
        logReg: True to use LogisticRegression, False for LinearRegression.

    Returns:
        A dictionary {line: {yKey: (slope, r2, lastPoint)}} where lastPoint
        is the most recent value.  For a logistic regression the slope is
        the raw coef_ array and r2 is fixed to 1.  A line without any usable
        value gets an empty dictionary.
    """
    outData = {}  # Will hold a dictionary with line: (slope, r_value)

    # Loops between each subway's line in masterDict
    for line in masterDict:
        outData[line] = {}  # Append line to outData

        # Loads the data for a given line & key from masterDict if it exists;
        # otherwise a single placeholder value of 1 is analyzed instead.
        try:
            yOld = masterDict[line][yKey]
        except (KeyError, TypeError):
            yOld = [1]

        # Keeps only the items that aren't ''
        yData = [item for item in yOld if item != '']
        if not yData:
            continue  # Nothing to fit; the line keeps its empty entry

        # We take the most recent data point
        lastPoint = float(yData[-1])

        # We can't calculate statistics where the x-axis is a datetime.
        # Here, all the x points become a simple unique integer by range.
        xData = np.arange(1, len(yData) + 1).reshape(len(yData), 1)

        # Creates the regression model and fits the dataset
        if logReg:
            yData = np.reshape(np.array(yData), (len(yData),))
            clf = linear_model.LogisticRegression()
        else:
            yData = np.reshape(np.array(yData), (len(yData), 1))
            clf = linear_model.LinearRegression()
        clf.fit(xData, yData)

        # Obtains coefficient/slope & R2/accuracy score of model
        if logReg:
            slope = clf.coef_
            r2 = 1
        else:
            # coef_ is an array here; the single element is picked explicitly
            # because float() on a multi-dimensional array is deprecated.
            slope = float(np.ravel(clf.coef_)[0])
            r2 = float(clf.score(xData, yData))

        # Saves slope, RSquared value, & the last data point to outData
        outData[line][yKey] = (slope, r2, lastPoint)
    return outData
""" Main Function: Loads all Values -----------------------------------------"""
# Accept File as Input Argument
inPath = '/Users/Matthew/Dropbox/Academics/AdvanceDS/5_Bokeh/Performance_NYCT.xml'

# List of keys we'll plot over for our xKey, yKey, & total keys
xKey = ['PERIOD_MONTH', 'PERIOD_YEAR']
yKey = ['MONTHLY_ACTUAL', 'MONTHLY_TARGET']
subKey = xKey + yKey

# Allocates Tuples for XML file import
waitData = ()

# Allocates Dictionary to save data for different categories
subWait = {}
subFail = {}
subOTP = {}
subTRD = {}

# Read XML File in one go; the 'with' block closes the file afterwards
with open(inPath, 'r') as inFile:
    xmlFile = inFile.read()

# Parse XML File into Dictionary
xmlDict = xmltodict.parse(xmlFile, process_namespaces=True)

# This Dictionary will probably have many incidents where one row contains all
# the data since the XML file was formatted strangely. This part of the code
# steps into the dictionary until we get the level containing all data.
# Only dictionaries are unwrapped, so a list holding a single row is kept.
while isinstance(xmlDict, dict) and len(xmlDict) == 1:
    xmlDict = next(iter(xmlDict.values()))
""" Main Function: Saves data to a master dictionary ---------------------------
Each data type is saved into its own dict as {line: {month: x, year: y,}}
-----------------------------------------------------------------------------"""
# Scans through XML file & saves values to a master dictionary
for entry in xmlDict:
    row = entry

    # Scans through each field of the entry (i.e. Subway line & metric)
    for item in row:
        try:
            # If a field matches one of these conditions, we save the row
            if 'Subway Wait Assessment' in row[item]:
                subWait = dictAdd(subWait, row, row['INDICATOR_NAME'], subKey)
            elif 'Mean Distance Between Failures - Subways' in row[item]:
                subFail = dictAdd(subFail, row, row['INDICATOR_NAME'], subKey)
            elif 'OTP (Terminal)' in row[item]:
                subOTP = dictAdd(subOTP, row, row['INDICATOR_NAME'], subKey)
            elif 'Total Ridership - Subways' in row[item]:
                subTRD = dictAdd(subTRD, row, row['INDICATOR_NAME'], subKey)
        except (AttributeError, KeyError, TypeError):
            # Field without text (e.g. None) or a row lacking a needed key:
            # skip it, but let anything unexpected (and Ctrl-C) through.
            continue
""" Main Function: Modifies data in the master dictionary -------------------"""
# Adds a 'PERIOD_DATE' column (month + year) to each of the four data sets
subWait, subFail, subOTP, subTRD = [
    createDate(dataSet, xKey[0], xKey[1], 'PERIOD_DATE')
    for dataSet in (subWait, subFail, subOTP, subTRD)
]

# From here on the x-axis is always the date column
xKey, pxLabel = 'PERIOD_DATE', 'Date'
""" Main Function: Plots each data set to an HTML file ----------------------"""
# Actual vs. target plots, one HTML page per metric:
# (data, title, y-axis label, legend names, file name, grid layout?)
for plotData, plotTitle, plotYLabel, plotLegend, plotFile, plotGrid in (
    (subFail, 'Average Miles before Failure', 'Miles',
     ['Actual Miles', 'Target Miles'], 'Failure', False),
    (subTRD, 'Total Ridership', 'Riders',
     ['Actual Riders', 'Target Riders'], 'Riders', False),
    (subWait, 'Average Wait Time', 'Percent',
     ['Actual Wait Time', 'Target Wait Time'], 'Wait', True),
    (subOTP, 'On Time Rate', 'Percent',
     ['Actual OTP', 'Target OTP'], 'OTP', True),
):
    bokehPlot(plotData, xKey, yKey, plotTitle, pxLabel, plotYLabel,
              plotLegend, plotFile, plotGrid)
""" Main Function: Combines multiple data sets for plotting -----------------"""
# Merges OTP & Wait per line ('MONTHLY_...' becomes 'OTP_...'/'WAIT_...'),
# then plots both actual values into the same figures
subOTPWait = combineDict(subOTP, subWait, xKey, yKey, 'OTP', 'WAIT')
bokehPlot(
    subOTPWait, xKey, ['WAIT_ACTUAL', 'OTP_ACTUAL'], 'Wait vs. OTP',
    pxLabel, 'Percent', ['Wait', 'OTP'], 'OTPWait', True,
)
""" Main Function: Combines multiple subway lines for plotting --------------"""
# Groups the subway lines into 'families' (family name: member lines)
familyList = {
    'ACE': ['ALine', 'CLine', 'ELine'],
    'BDFM': ['BLine', 'DLine', 'FLine', 'MLine'],
    'G': ['GLine'],
    'L': ['LLine'],
    'JZ': ['JZLine'],
    'NQR': ['NLine', 'QLine', 'RLine'],
    '123': ['1Line', '2Line', '3Line'],
    '456': ['4Line', '5Line', '6Line'],
    '7': ['7Line'],
    'SFkln': ['SLineFkln'],
    'S42': ['SLine42St'],
    'SRock': ['SLineRock'],
}

# Averages the actual values over every family
subOTPFamily = combineLine(subOTP, familyList, 'MONTHLY_ACTUAL', xKey)
subWaitFamily = combineLine(subWait, familyList, 'MONTHLY_ACTUAL', xKey)

# One grid page per metric for the family averages
bokehPlot(
    subWaitFamily, xKey, yKey, 'Average Wait Time (Families)', pxLabel,
    'Percent', ['Actual Wait Time', 'Target Wait Time'], 'WaitFamily', True,
)
bokehPlot(
    subOTPFamily, xKey, yKey, 'On Time Rate (Families)', pxLabel,
    'Percent', ['Actual OTP', 'Target OTP'], 'OTPFamily', True,
)
""" Main Function: Combines multiple data sets & subway lines for plotting --"""
# Merges the family averages of OTP & Wait into one dictionary ...
subOTPWaitFamily = combineDict(
    subOTPFamily, subWaitFamily, xKey, yKey, 'OTP', 'WAIT',
)

# ... and plots both into the same figures
bokehPlot(
    subOTPWaitFamily, xKey, ['WAIT_ACTUAL', 'OTP_ACTUAL'],
    'Wait vs. OTP (Families)', pxLabel, 'Percent', ['Wait', 'OTP'],
    'OTPWaitFamily', True,
)
""" Main Function: Calculates slope statistics for each line ----------------"""
# Linear-regression slope, R2 & most recent MONTHLY_ACTUAL value per line
subTRDStats, subOTPStats, subWaitStats, subFailStats = [
    lineStats(dataSet, 'MONTHLY_ACTUAL', False)
    for dataSet in (subTRD, subOTP, subWait, subFail)
]

# Writes one CSV file per statistic, rows sorted by line name
for csvName, csvStats in (
    ('subOTPStats.csv', subOTPStats),
    ('subWaitStats.csv', subWaitStats),
    ('subTRDStats.csv', subTRDStats),
    ('subFailStats.csv', subFailStats),
):
    with open(csvName, 'wt') as f:
        csv_writer = csv.writer(f, delimiter=',')
        csv_writer.writerows(sorted(csvStats.items()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment