Last active
December 26, 2017 22:45
-
-
Save mwidjaja1/230f8cc38b4353decfee to your computer and use it in GitHub Desktop.
With NYC Subway Data from 2009-2011, I analyzed each subway line to derive conclusions regarding on-time performance & reliability.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Bokeh (NYC Subway) --------------------------------------------------------- | |
Goal:   This script takes the NYC Subway data & parses it for visualization
        techniques.
Input:  http://web.mta.info/developers/performance.html has performance XML
        data. We only cover the subway and we only care about these metrics:
        1. Subway wait assessment for all lines
           Actual interval between trains
        2. Mean Distance Between Failure
           Miles until a train hits a mechanical failure causing a delay
        3. On-Time Performance for each line, not the total (OTP)
           Trains arriving within 5 minutes of their scheduled time.
        4. Total Ridership
           All customers riding, including those riding with free transfers.
        Each of those metrics consists of sub-metrics. We only care about:
        MONTHLY_ACTUAL, MONTHLY_TARGET, PERIOD_MONTH, & PERIOD_YEAR
Output: We get CSV files (for the slope/growth of each statistic) and/or
        HTML plots for each of the following categories:
        1. Failure.html & subFailStats.csv:
           Amount of failures across all lines in the subway system
        2. OTP.html & subOTPStats.csv:
           On time performance for each subway line in the system
        3. OTPFamily.html:
           #2 for each family of subway lines
        4. OTPWait.html:
           On time performance & wait time assessments for each subway line
        5. OTPWaitFamily.html:
           #4 but for each family of subway lines
        6. Riders.html & subTRDStats.csv:
           Amount of riders across all lines in the subway system
        7. Wait.html & subWaitStats.csv:
           Wait time assessment for each subway line in the system
        8. WaitFamily.html:
           #7 for each family of subway lines
-----------------------------------------------------------------------------""" | |
import numpy as np | |
from bokeh.plotting import figure, output_file, show, gridplot | |
from sklearn import linear_model | |
import csv | |
import datetime as dt | |
import xmltodict | |
""" convertVal ----------------------------------------------------------------- | |
Goal: This converts a (list of) value(s) to a float if possible. | |
From: Called from dictAdd | |
Input: (1) The list of values | |
Output: A converted list of values | |
-----------------------------------------------------------------------------""" | |
def convertVal(value):
    """Convert a numeric string to a float.

    Goal:   Strips thousands separators (commas) from the value and converts
            the result to a float.
    From:   Called from dictAdd
    Input:  (1) The raw value, normally a string such as '1,234.5'
    Output: The value as a float, or the empty string '' when the value is not
            a string or cannot be parsed as a number.
    """
    try:
        return float(value.replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: value is not a text string (e.g. None)
        # ValueError: the text is not a number (e.g. '' or 'N/A')
        return ''
""" dictAdd -------------------------------------------------------------------- | |
Goal: We are given a dictionary for each item (like a subway line). This | |
function takes each dictionary and appends it to a master dictionary | |
which contains data for all of the items (like every subway line). | |
If there are multiple dictionaries for each item, we'll append the | |
values to the master dictionary so that no values get deleted. | |
From: Called from main | |
Input: (1) A master dictionary (of all subway lines) where (2) a smaller | |
dictionary (of one subway line) will be added to. | |
We will save (4) certain keys of data to this smaller dictionary & | |
this smaller dictionary will be saved to the larger dictionary under
(3) one key (i.e. subway line). | |
Output: A master dictionary which has the smaller dictionary appended. | |
-----------------------------------------------------------------------------""" | |
def dictAdd(masterDict, smallDict, oldLine, keys):
    """Append one record's values to the master dictionary of all lines.

    Goal:   Derives a cleaned-up line name from oldLine and appends the
            converted values of smallDict to the lists stored under that
            name in masterDict, creating the entry when it does not exist.
    From:   Called from main
    Input:  (1) The master dictionary {line: {key: [values]}}, (2) the
            dictionary of one record, (3) the raw name the line is derived
            from, & (4) the keys of the record that should be saved.
    Output: masterDict (the same object, updated in place). A KeyError is
            raised, and masterDict is left untouched, when the record lacks
            one of the requested keys.
    """
    # The line is the text after the first '-' when there is one, otherwise
    # the whole name; spaces are removed either way
    parts = oldLine.split('-')
    line = (parts[1] if len(parts) > 1 else oldLine).replace(' ', '')

    # The S Line is named inconsistently so this renames them. Any name that
    # contains a capital 'S' but none of the tags below becomes plain 'SLine'.
    if 'S' in line:
        for tag in ('42St', 'Fkln', 'Rock'):
            if tag in line:
                line = 'SLine' + tag
                break
        else:
            line = 'SLine'

    # Convert every value first: if a key is missing, the KeyError is raised
    # before masterDict is modified, so the lists never end up with
    # different lengths.
    newValues = [(subKey, convertVal(smallDict[subKey])) for subKey in keys]

    lineDict = masterDict.setdefault(line, {})
    for subKey, newValue in newValues:
        # Store a fresh list rather than appending to the existing one
        lineDict[subKey] = list(lineDict.get(subKey, [])) + [newValue]
    return masterDict
""" createDate ----------------------------------------------------------------- | |
Goal: Combines the values of two keys in our dictionary. In our case, | |
we'll do this to combine the month + year column = date column. | |
Because of this, we assume that all of the values in the oldKeys | |
should be integers.
From: Called from main | |
Input: (1) The dictionary of data with the (2) first key & (3) second key | |
which will be combined and saved as (4) a new key name. | |
Output: The dictionary of data where the two columns are added. | |
-----------------------------------------------------------------------------""" | |
def createDate(masterDict, oldKey1, oldKey2, newKey):
    """Combine a month column and a year column into a date column.

    Goal:   For every line in masterDict, pairs up the values stored under
            oldKey1 (months) & oldKey2 (years) and saves a list of datetime
            objects (first day of that month) under newKey.
    From:   Called from main
    Input:  (1) The dictionary of data with the (2) month key & (3) year key
            which will be combined and saved as (4) a new key name.
    Output: masterDict (the same object, updated in place). A ValueError is
            raised when a month or year cannot be converted to an integer
            (e.g. the '' stored for a missing value) or is out of range.
    """
    for line in masterDict:
        months = masterDict[line][oldKey1]
        years = masterDict[line][oldKey2]
        # Build the date directly rather than formatting a 'year/month' string
        # and parsing it back
        masterDict[line][newKey] = [
            dt.datetime(int(year), int(month), 1)
            for month, year in zip(months, years)
        ]
    return masterDict
""" bokehPlot ------------------------------------------------------------------ | |
Goal: Makes a plot for the actual & target value per each month & year. | |
From: Called from main | |
Input: (1) The dictionary of data with the (2) x-axis key name & the | |
(3) y-axis key-name (can be a list) from said dictionary. | |
A plot will be saved onto an HTML file with a (4) pTitle, | |
(5) x-axis label, (6) y-axis label, & (7) the names of each data | |
point set for the legend (to skip the legend, set pLegend=None)
The plot will be saved as a (8) file name (don't include extension) | |
in (9) a True = Grid fLayout or False = One Plot fLayout. | |
Output: A HTML plot for the actual and target values in the pwd. | |
-----------------------------------------------------------------------------""" | |
def bokehPlot(masterDict, xKey, yKey, pTitle, pxLabel, pyLabel, pLegend, fName, fGrid):
    """Plot the requested series of every line and show them as HTML.

    Goal:   Makes one figure per line in masterDict (in sorted order) and
            shows either a two-column grid of all figures or only the first.
    From:   Called from main
    Input:  (1) The dictionary of data with the (2) x-axis key name & the
            (3) y-axis key names from said dictionary.
            (4) pTitle is the title prefix (the line name is appended),
            (5) the x-axis label, (6) the y-axis label, & (7) the legend
            names for each y-axis key (to skip the legend, set pLegend=None).
            The output is written to (8) fName + '.html' and laid out as
            (9) True = grid of all figures or False = only the first figure.
    Output: None. Nothing is shown when no line has any of the y-axis keys.
    """
    # Output to static HTML file
    output_file(fName + ".html", title=pTitle)

    pList = []  # Figures that actually received data

    # Loops through each subway line in masterDict
    for line in sorted(masterDict):
        # NOTE(review): the keyword names below (plot_height,
        # title_text_font_size) depend on the installed Bokeh version — confirm
        if fGrid:
            p = figure(title=pTitle + ' ' + line, width=525, plot_height=350,
                       x_axis_label=pxLabel, x_axis_type="datetime",
                       y_axis_label=pyLabel, title_text_font_size='14pt')
        else:
            p = figure(title=pTitle + ' ' + line, x_axis_label=pxLabel,
                       x_axis_type="datetime", y_axis_label=pyLabel)

        # The status starts at False for every line so that a line without
        # data is not kept just because an earlier line had some
        p, status = circlePlot(p, masterDict, line, xKey, yKey, pLegend, False)

        if status:
            p.left[0].formatter.use_scientific = False  # Removes sci notation
            pList.append(p)

    # Nothing to show (also avoids indexing into an empty list below)
    if not pList:
        return

    if fGrid:
        pPlot = gridfLayout(pList, 2)
    else:
        pPlot = pList[0]

    # Shows plot
    show(pPlot)
""" circlePlot ----------------------------------------------------------------- | |
Goal: Creates a circle plot from a dictionary of data named after some key | |
(i.e. line) and aligns it to an X-Axis | |
From: Called from bokehPlot | |
Input: (1) The current figure which is being worked on. | |
(2) The master dictionary of data with the (3) key of the smaller | |
dictionary (i.e. current subway line) which is being evaluated in | |
the master dict. | |
We will look in the masterDict[line] for a (4) x-axis key name & | |
(5) y-axis key-names (yKey can be given as a list). | |
This line will be saved on the pLeg as (6) pLeg. Finally, we | |
provide a (7) status if we found data. | |
Output: The figure & the status (True if any of the y-axis keys was found)
-----------------------------------------------------------------------------""" | |
def circlePlot(p, masterDict, line, xKey, yKey, pLegend, status):
    """Add one circle series per available y-axis key to a figure.

    Goal:   For every key in yKey that exists in masterDict[line], calls
            p.circle with the x values masterDict[line][xKey] and the y
            values stored under that key.
    From:   Called from bokehPlot
    Input:  (1) The figure being worked on, (2) the master dictionary,
            (3) the key of the line being plotted, (4) the x-axis key name,
            (5) the y-axis key names (a list, or a single string),
            (6) the legend names matching yKey by position (or None for no
            legend), & (7) the status to return if no key is found.
    Output: The tuple (p, status) where status is True once at least one
            y-axis key was found, otherwise the status that was passed in.
    """
    clr = ['blue', 'green', 'orange', 'red']  # Colors to plot with

    # A single key name is treated as a one-item list instead of being
    # iterated character by character
    if isinstance(yKey, str):
        yKey = [yKey]
        if isinstance(pLegend, str):
            pLegend = [pLegend]

    for idx, subKey in enumerate(yKey):
        if subKey not in masterDict[line]:
            continue
        status = True
        # Colors repeat when there are more series than colors
        options = {'color': clr[idx % len(clr)], 'line_width': 2}
        if pLegend is not None:
            options['legend'] = pLegend[idx]
        p.circle(masterDict[line][xKey], masterDict[line][subKey], **options)
    return p, status
""" gridfLayout ---------------------------------------------------------------- | |
Goal: Takes a list of Bokeh Plots and creates a list of lists out of them | |
so that the list becomes a two-dimensional list, each sub-list with | |
as many values as there are plots in one row. | |
From: Called from bokehPlot | |
Input: (1) The list of plots and (2) how many columns should be made. | |
Output: A list of lists | |
-----------------------------------------------------------------------------""" | |
def gridfLayout(pList, col):
    """Arrange a flat list of plots into rows and build a grid plot.

    Goal:   Splits the list of plots into consecutive rows of 'col' plots
            (the last row may be shorter) and passes the resulting list of
            lists to gridplot.
    From:   Called from bokehPlot
    Input:  (1) The list of plots and (2) how many columns should be made.
    Output: Whatever gridplot returns for the list of rows.
    """
    # Rows start at index 0 so the first plot is included; slicing simply
    # yields a shorter final row when the plots do not fill it
    pLists = [pList[i:i + col] for i in range(0, len(pList), col)]
    return gridplot(pLists)
""" combineDict ---------------------------------------------------------------- | |
Goal: Combines two dictionaries and merges them based on their xKey. | |
From: Called from main | |
Input: (1) The first and (2) second dictionary which will be merged based | |
on their (3) xKey key name. In the (4) list of yKey key names, | |
the 'MONTHLY' in each key's name will be replaced by a string for | |
(5) the first and (6) the second dict. | |
Output: The combined dictionary | |
-----------------------------------------------------------------------------""" | |
def combineDict(inDict1, inDict2, yKey, xKey, name1, name2):
    """Merge two {line: {key: values}} dictionaries into one.

    Goal:   Every line of the first dictionary gets an entry in the result.
            Its keys are copied with 'MONTHLY' replaced by name1. The keys of
            the same line in the second dictionary are then added with
            'MONTHLY' replaced by name2. Keys without 'MONTHLY' in their name
            (such as a shared date key) keep their name, so the value from
            the second dictionary overwrites the one from the first.
    From:   Called from main
    Input:  (1) The first and (2) second dictionary, (3) yKey & (4) xKey
            (both accepted for compatibility but not used), & the strings
            which replace 'MONTHLY' for (5) the first and (6) the second
            dictionary.
    Output: The combined dictionary. Lines that only exist in the second
            dictionary are left out.
    """
    masterDict = {}

    # Loops between each line in the first dictionary
    for line in inDict1:
        masterDict[line] = {}  # Creates entry for subway line
        try:
            for subKey in inDict1[line]:
                subKeyNew = subKey.replace('MONTHLY', name1)
                masterDict[line][subKeyNew] = inDict1[line][subKey]
        except (AttributeError, TypeError):
            # Entry is not a dictionary with string keys: keep what was
            # copied so far and move on to the next line
            continue

    # Loops between each line in the second dictionary
    for line in inDict2:
        if line not in masterDict:  # Line is unknown to the first dictionary
            continue
        try:
            for subKey in inDict2[line]:
                subKeyNew = subKey.replace('MONTHLY', name2)
                masterDict[line][subKeyNew] = inDict2[line][subKey]
        except (AttributeError, TypeError):
            continue
    return masterDict
""" combineLine ---------------------------------------------------------------- | |
Goal: Combines many lines from one dictionary and merges them based on | |
their xKey. This assumes that each family of line has data recorded | |
over the same period of time | |
From: Called from main | |
Input: (1) The first dictionary which will be merged over the (2) lists | |
of subway lines which should be merged together (this should be a | |
stacked list where the lowest list is the list of all subway lines | |
that should be merged) and the average of each line's (3) key name. | |
The (4) x-axis' key name should be the same across both lines & thus | |
we will copy the longest list of x-axis key names. | |
Output: The combined dictionary where we have keys only for unified lines | |
-----------------------------------------------------------------------------""" | |
def combineLine(inDict, familyList, yKey, xKey):
    """Average one key across the lines of each family.

    Goal:   For every family, averages the yKey values of its lines position
            by position (the first value of every line, then the second, and
            so on). This assumes each line of a family has data recorded over
            the same period of time; values are matched by position, not by
            date.
    From:   Called from main
    Input:  (1) The dictionary {line: {key: [values]}}, (2) a dictionary
            {family: [line names]}, (3) the key whose values are averaged, &
            (4) the x-axis key name.
    Output: A dictionary {family: {yKey: [averages], xKey: [x values]}}. The
            x values are copied from the line with the longest list (the last
            such line if several are equally long). Missing values ('' or
            None) are ignored in the average; a position without any usable
            value gets ''. A KeyError is raised when a listed line, or one of
            the two keys, is not in inDict.
    """
    familyDict = {}

    # Loops between groups of family
    for family in familyList:
        valuesList = []  # One list of yKey values per line of the family
        dateRange = []   # The x-axis values of the longest line

        for line in familyList[family]:
            valuesList.append(list(inDict[line][yKey]))
            dates = inDict[line][xKey]
            if len(dates) >= len(dateRange):
                dateRange = dates

        # Lines may have different lengths; go as far as the longest one
        # (0 when the family has no lines at all)
        longest = max([len(values) for values in valuesList] + [0])

        averageList = []
        for valIdx in range(longest):
            # Values of this position across all lines that have one
            found = [values[valIdx] for values in valuesList
                     if valIdx < len(values)
                     and values[valIdx] is not None
                     and values[valIdx] != '']
            try:
                averageList.append(np.mean(found) if found else '')
            except (TypeError, ValueError):
                # A non-numeric value slipped through: treat it as missing
                averageList.append('')

        # Saves averages and dates under the name of the family
        familyDict[family] = {yKey: averageList, xKey: dateRange}
    return familyDict
""" lineStats ------------------------------------------------------------------ | |
Goal: Performs a linear regression for each 'line' in the master | |
dictionary to return its slope and r2 value. | |
From: Called from main | |
Input: (1) The master dictionary, the (2) single key of data to analyze, | |
(3) and True=LogRegression or False=LinearRegression | |
Output: A dictionary with the slope, r2 value, and the last data point (that | |
is, the most recent data point obtained) | |
-----------------------------------------------------------------------------""" | |
def lineStats(masterDict, yKey, logReg):
    """Fit a regression to one key of every line.

    Goal:   For each line in masterDict, fits the values stored under yKey
            (missing '' values removed) against the positions 1, 2, 3, ...
            and records the fitted coefficient.
    From:   Called from main
    Input:  (1) The master dictionary, the (2) single key of data to analyze,
            (3) and True=LogisticRegression or False=LinearRegression
    Output: A dictionary {line: {yKey: (slope, r2, lastPoint)}} where
            lastPoint is the last non-missing value as a float. For linear
            regression slope and r2 are floats; for logistic regression slope
            is the model's coef_ array and r2 is always 1. A line that does
            not have yKey is fitted on the placeholder data [1]. A line whose
            values are all missing gets an empty dictionary.
    """
    outData = {}

    # Loops between each subway line in masterDict
    for line in masterDict:
        outData[line] = {}

        # Loads the data for a given line & key from masterDict if it exists
        try:
            yOld = masterDict[line][yKey]
        except (KeyError, TypeError):
            yOld = [1]

        # Drops the '' entries that mark missing values
        yData = [item for item in yOld if item != '']
        if not yData:
            continue  # Nothing to fit and no last point to report

        # We take the most recent data point
        lastPoint = float(yData[-1])

        # We can't calculate statistics where the x-axis is a datetime, so
        # the x points are simply 1..n as a column vector
        xData = np.arange(1, len(yData) + 1).reshape(-1, 1)

        if logReg:
            yArray = np.array(yData).reshape(-1)
            clf = linear_model.LogisticRegression()
            clf.fit(xData, yArray)
            slope = clf.coef_
            r2 = 1
        else:
            yArray = np.array(yData).reshape(-1, 1)
            clf = linear_model.LinearRegression()
            clf.fit(xData, yArray)
            # coef_ is an array with a single entry here; take that entry
            # instead of converting the whole array to a float
            slope = float(np.ravel(clf.coef_)[0])
            r2 = float(clf.score(xData, yArray))

        # Saves slope, RSquared value, & the last data point to outData
        outData[line][yKey] = (slope, r2, lastPoint)
    return outData
""" Main Function: Loads all Values -----------------------------------------""" | |
# Path of the MTA performance XML file
inPath = '/Users/Matthew/Dropbox/Academics/AdvanceDS/5_Bokeh/Performance_NYCT.xml'

# List of keys we'll plot over for our xKey, yKey, & total keys
xKey = ['PERIOD_MONTH', 'PERIOD_YEAR']
yKey = ['MONTHLY_ACTUAL', 'MONTHLY_TARGET']
subKey = xKey + yKey

# Allocates Dictionary to save data for different categories
subWait = {}
subFail = {}
subOTP = {}
subTRD = {}

# Read XML File (the file is closed as soon as it has been read)
with open(inPath, 'r') as inFile:
    xmlFile = inFile.read()

# Parse XML File into Dictionary
xmlDict = xmltodict.parse(xmlFile, process_namespaces=True)

# The parsed dictionary wraps the data in levels that hold a single key. This
# part of the code unwraps those levels until we reach the object that holds
# all the rows. It stops at anything that is not a dictionary, so a file with
# only one row (a one-element list) does not break it.
while isinstance(xmlDict, dict) and len(xmlDict) == 1:
    xmlDict = next(iter(xmlDict.values()))

# ----- Main Function: Saves data to a master dictionary ----------------------
# Each data type is saved into its own dict as {line: {month: x, year: y,}}

# Phrase to look for in a row & the dictionary that row is saved to. The order
# matters: the first phrase found in a field decides the category.
categories = (
    ('Subway Wait Assessment', subWait),
    ('Mean Distance Between Failures - Subways', subFail),
    ('OTP (Terminal)', subOTP),
    ('Total Ridership - Subways', subTRD),
)

# Scans through XML file & saves values to a master dictionary
for row in xmlDict:
    # Scans through each field of the row until one names a category
    for item in row:
        try:
            fieldValue = row[item]
            matches = [data for phrase, data in categories
                       if phrase in fieldValue]
        except (KeyError, IndexError, TypeError):
            continue  # Field is empty or not text: look at the next field
        if not matches:
            continue
        try:
            dictAdd(matches[0], row, row['INDICATOR_NAME'], subKey)
        except (AttributeError, KeyError, TypeError):
            pass  # Row lacks one of the fields we need: skip the row
        # A row is saved at most once, even if several fields name a category
        break

# ----- Main Function: Modifies data in the master dictionary -----------------
# Combines the month & year column into a date column
subWait = createDate(subWait, xKey[0], xKey[1], 'PERIOD_DATE')
subFail = createDate(subFail, xKey[0], xKey[1], 'PERIOD_DATE')
subOTP = createDate(subOTP, xKey[0], xKey[1], 'PERIOD_DATE')
subTRD = createDate(subTRD, xKey[0], xKey[1], 'PERIOD_DATE')

# Sets standard key and label for the xAxis
xKey = 'PERIOD_DATE'
pxLabel = 'Date'

# ----- Main Function: Plots each data set to an HTML file --------------------
# Creates a plot for each of the conditions for actual vs. target goals
bokehPlot(subFail, xKey, yKey, 'Average Miles before Failure', pxLabel, 'Miles',
          ['Actual Miles', 'Target Miles'], 'Failure', False)
bokehPlot(subTRD, xKey, yKey, 'Total Ridership', pxLabel, 'Riders',
          ['Actual Riders', 'Target Riders'], 'Riders', False)
bokehPlot(subWait, xKey, yKey, 'Average Wait Time', pxLabel, 'Percent',
          ['Actual Wait Time', 'Target Wait Time'], 'Wait', True)
bokehPlot(subOTP, xKey, yKey, 'On Time Rate', pxLabel, 'Percent',
          ['Actual OTP', 'Target OTP'], 'OTP', True)

# ----- Main Function: Combines multiple data sets for plotting ---------------
# Combines the OTP and Wait data dictionaries & plots it
subOTPWait = combineDict(subOTP, subWait, yKey, xKey, 'OTP', 'WAIT')
bokehPlot(subOTPWait, xKey, ['WAIT_ACTUAL', 'OTP_ACTUAL'], 'Wait vs. OTP',
          pxLabel, 'Percent', ['Wait', 'OTP'], 'OTPWait', True)

# ----- Main Function: Combines multiple subway lines for plotting ------------
# Creates 'families' of subway lines
familyList = {'ACE': ['ALine', 'CLine', 'ELine'],
              'BDFM': ['BLine', 'DLine', 'FLine', 'MLine'], 'G': ['GLine'],
              'L': ['LLine'], 'JZ': ['JZLine'],
              'NQR': ['NLine', 'QLine', 'RLine'],
              '123': ['1Line', '2Line', '3Line'],
              '456': ['4Line', '5Line', '6Line'],
              '7': ['7Line'], 'SFkln': ['SLineFkln'],
              'S42': ['SLine42St'], 'SRock': ['SLineRock']}

# Combines 'families' of subway lines for plotting
subOTPFamily = combineLine(subOTP, familyList, 'MONTHLY_ACTUAL', xKey)
subWaitFamily = combineLine(subWait, familyList, 'MONTHLY_ACTUAL', xKey)

# Plots 'families' from the above data set
bokehPlot(subWaitFamily, xKey, yKey, 'Average Wait Time (Families)', pxLabel,
          'Percent', ['Actual Wait Time', 'Target Wait Time'], 'WaitFamily',
          True)
bokehPlot(subOTPFamily, xKey, yKey, 'On Time Rate (Families)', pxLabel,
          'Percent', ['Actual OTP', 'Target OTP'], 'OTPFamily', True)

# ----- Main Function: Combines multiple data sets & subway lines -------------
# Combines the OTP and Wait data dictionaries
subOTPWaitFamily = combineDict(subOTPFamily, subWaitFamily, yKey, xKey,
                               'OTP', 'WAIT')

# Plots the combined dataset dictionary
bokehPlot(subOTPWaitFamily, xKey, ['WAIT_ACTUAL', 'OTP_ACTUAL'],
          'Wait vs. OTP (Families)', pxLabel, 'Percent', ['Wait', 'OTP'],
          'OTPWaitFamily', True)

# ----- Main Function: Calculates slope statistics for each line --------------
# Calculates Slope & the last MONTHLY_ACTUAL value for each data set
subTRDStats = lineStats(subTRD, 'MONTHLY_ACTUAL', False)
subOTPStats = lineStats(subOTP, 'MONTHLY_ACTUAL', False)
subWaitStats = lineStats(subWait, 'MONTHLY_ACTUAL', False)
subFailStats = lineStats(subFail, 'MONTHLY_ACTUAL', False)

# Creates CSV Files for the Statistics (one row per line, sorted by line name)
for csvName, stats in (('subOTPStats.csv', subOTPStats),
                       ('subWaitStats.csv', subWaitStats),
                       ('subTRDStats.csv', subTRDStats),
                       ('subFailStats.csv', subFailStats)):
    with open(csvName, 'wt') as f:
        csv_writer = csv.writer(f, delimiter=',')
        csv_writer.writerows(sorted(stats.items()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment