mwidjaja1/NYCSubway.py

## NYCSubway.py
""" Bokeh (NYC Subway) ---------------------------------------------------------
    Goal:   This script takes the NYC Subway data & parses it for visualization
            techniques.

    Input:  http://web.mta.info/developers/performance.html has performance XML
            data. We only cover subway and we only care for these metrics:
                1. Subway wait assessment for all lines
                   Actual interval between trains
                2. Mean Distance Between Failure
                   Miles until a train hits a mechanical failure causing a delay
                3. On-Time Performance for each line, not the total (OTP)
                   Trains arriving within 5 minutes of their scheduled time.
                4. Total Ridership
                   All customers riding, even though riding w. free transfers.

            Each of those metrics consist of sub-metrics. We only care for:
            MONTHLY_ACTUAL, MONTHLY_TARGET, PERIOD_MONTH, & PERIOD_YEAR

    Output: We get CSV files (for the slope/growth of each statistic) and/or
            HTML plots for each of the following categories:
            1.  Failure.html & subFailStats.csv:
                Amount of failures across all lines in the subway system
            2.  OTP.html & subOTPStats.csv:
                On time performance for each subway line in the system
            3.  OTPFamily.html:
                #2 for each family of subway lines
            4.  OTPWait.html:
                On time performance & wait time assessments for each subway line
            5.  OTPWaitFamily.html:
                #4 but for each family of subway lines
            6.  Riders.html & subTRDStats.csv:
                Amount of riders across all lines in the subway system
            7.  Wait.html & subWaitStats.csv:
                Wait time assessment for each subway line in the system
            8.  WaitFamily.html
                #7 for each family of subway lines
-----------------------------------------------------------------------------"""

import numpy as np
from bokeh.plotting import figure, output_file, show, gridplot
from sklearn import linear_model
import csv
import datetime as dt
import xmltodict

""" convertVal -----------------------------------------------------------------
    Goal:   This converts a (list of) value(s) to a float if possible.
    From:   Called from dictAdd

    Input:  (1) The list of values
    Output: A converted list of values
-----------------------------------------------------------------------------"""
def convertVal(value):
    """Convert a numeric string (thousands commas allowed) to a float.

    Input:  value, normally a string such as '1,234.5'.
    Output: The float, or '' when value cannot be converted (it is not a
            string, or its text is not a number).
    """
    try:
        # float() rejects thousands separators, so strip them first
        return float(value.replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .replace (None, numbers, ...)
        # TypeError:      .replace exists but rejects str arguments (bytes)
        # ValueError:     the text is not a number
        return ''


""" dictAdd --------------------------------------------------------------------
    Goal:   We are given a dictionary for each item (like a subway line). This
            function takes each dictionary and appends it to a master dictionary
            which contains data for all of the items (like every subway line).
            If there are multiple dictionaries for each item, we'll append the
            values to the master dictionary so that no values get deleted.
    From:   Called from main

    Input:  (1) A master dictionary (of all subway lines) where (2) a smaller
            dictionary (of one subway line) will be added to.

            We will save (4) certain keys of data to this smaller dictionary &
            this smaller dictionary will be saved to the larger dictionary under
            (3) one key (i.e. subway line).

    Output: A master dictionary which has the smaller dictionary appended.
-----------------------------------------------------------------------------"""
def dictAdd(masterDict, smallDict, oldLine, keys):
    """Append one record's values to the per-line series in masterDict.

    Input:  masterDict maps a line name to {key: [values]}.
            smallDict is one record; only the entries named in keys are used.
            oldLine is the raw indicator name the line name is derived from.
    Output: masterDict (updated in place and returned).
    Raises: KeyError when smallDict lacks one of keys; masterDict is left
            untouched in that case.
    """
    # Keeps only the text after the first '-' (the subway line) when there is
    # one, and strips every space from it
    parts = oldLine.split('-')
    line = parts[1] if len(parts) > 1 else oldLine
    line = line.replace(' ', '')

    # The S Line is named inconsistently so this renames them
    if 'S' in line:
        if '42St' in line:
            line = 'SLine42St'
        elif 'Fkln' in line:
            line = 'SLineFkln'
        elif 'Rock' in line:
            line = 'SLineRock'
        else:
            line = 'SLine'

    # Converts every value BEFORE touching masterDict: if a key is missing the
    # KeyError is raised here, so the stored series never end up with
    # different lengths.
    newValues = [(subKey, convertVal(smallDict[subKey])) for subKey in keys]

    # Appends to the existing series, creating the line / series on first use
    lineData = masterDict.setdefault(line, {})
    for subKey, newValue in newValues:
        lineData.setdefault(subKey, []).append(newValue)

    return masterDict


""" createDate -----------------------------------------------------------------
    Goal:   Combines the values of two keys in our dictionary. In our case,
            we'll do this to combine the month + year column = date column.
            Because of this, we assume that all of the values in the oldKeys
            should be integers.
    From:   Called from main

    Input:  (1) The dictionary of data with the (2) first key & (3) second key
            which will be combined and saved as (4) a new key name.

    Output: The dictionary of data where the two columns are added.
-----------------------------------------------------------------------------"""
def createDate(masterDict, oldKey1, oldKey2, newKey):
    """Add a datetime column built from a month column and a year column.

    Input:  masterDict maps each line to a dict of columns; oldKey1 names the
            month column and oldKey2 the year column (values convertible by
            int()). The result is stored under newKey.
    Output: masterDict (updated in place and returned).
    """
    for columns in masterDict.values():
        # Whole numbers as text, e.g. 3.0 -> '3'
        months = [str(int(entry)) for entry in columns[oldKey1]]
        years = [str(int(entry)) for entry in columns[oldKey2]]

        # Pairs month with year and parses 'YYYY/M' into a datetime
        columns[newKey] = [
            dt.datetime.strptime(year + '/' + month, '%Y/%m')
            for month, year in zip(months, years)
        ]

    return masterDict


""" bokehPlot ------------------------------------------------------------------
    Goal:   Makes a plot for the actual & target value per each month & year.
    From:   Called from main

    Input:  (1) The dictionary of data with the (2) x-axis key name & the
            (3) y-axis key-name (can be a list) from said dictionary.

            A plot will be saved onto an HTML file with a (4) pTitle,
            (5) x-axis label, (6) y-axis label, & (7) the names of each data
            point set for the legend (to skip the legend, set pyLabel=None)

            The plot will be saved as a (8) file name (don't include extension)
            in (9) a True = Grid fLayout or False = One Plot fLayout.

    Output: A HTML plot for the actual and target values in the pwd.
-----------------------------------------------------------------------------"""
def bokehPlot(masterDict, xKey, yKey, pTitle, pxLabel, pyLabel, pLegend, fName, fGrid):
    """Plot each line of masterDict into <fName>.html and show the result.

    Input:  masterDict maps a line name to its columns; xKey is the x column
            and yKey the list of y columns handed to circlePlot.
            pTitle, pxLabel & pyLabel label the figures; pLegend names each
            y series (None skips the legend).
            fGrid=True shows every plotted line in a two-column grid,
            fGrid=False shows only the first plotted line (in sorted order).
    Output: None. Nothing is shown when no line contained any yKey data.
    """
    # Output to static HTML file
    output_file(fName+".html", title=pTitle)

    pList = []      # Figures that actually received data

    # Loops through each subway line in masterDict
    for line in sorted(masterDict):
        # Create a new plot with a pTitle and axis labels
        if fGrid:
            p = figure(title=pTitle+' '+line, width=525, plot_height=350,
                       x_axis_label=pxLabel, x_axis_type = "datetime",
                       y_axis_label=pyLabel, title_text_font_size='14pt')
        else:
            p = figure(title=pTitle+' '+line, x_axis_label=pxLabel,
                       x_axis_type = "datetime", y_axis_label=pyLabel)

        # The status starts at False for every line. Carrying it over from
        # the previous line would keep it True forever after the first hit
        # and let empty figures into the list.
        p, status = circlePlot(p, masterDict, line, xKey, yKey, pLegend, False)

        # Append this plot to the list of all plots
        if status:
            p.left[0].formatter.use_scientific = False  # Removes sci notation
            pList.append(p)

    # Nothing was plotted: there is no figure to show (pList[0] would fail)
    if not pList:
        return

    # A grid of every plot, or only the first plot
    if fGrid:
        pPlot = gridfLayout(pList,2)
    else:
        pPlot = pList[0]

    # Shows plot
    show(pPlot)


""" circlePlot -----------------------------------------------------------------
    Goal:   Creates a circle plot from a dictionary of data named after some key
            (i.e. line) and aligns it to an X-Axis
    From:   Called from bokehPlot

    Input:  (1) The current figure which is being worked on.

            (2) The master dictionary of data with the (3) key of the smaller
            dictionary (i.e. current subway line) which is being evaluated in
            the master dict.

            We will look in the masterDict[line] for a (4) x-axis key name &
            (5) y-axis key-names (yKey can be given as a list).

            Each data set is labeled in the legend with (6) pLegend. Finally, we
            provide a (7) status if we found data.
    Output: The updated figure and the status flag
-----------------------------------------------------------------------------"""
def circlePlot(p, masterDict, line, xKey, yKey, pLegend, status):
    """Draw one circle series on p for every yKey entry found for line.

    Input:  p is the figure to draw on; masterDict[line] holds the columns.
            xKey is the x column, yKey the list of y columns. pLegend gives
            one legend label per yKey entry (None skips the legend).
            status is the incoming "found data" flag.
    Output: (p, status) where status is True if at least one series was drawn
            here, otherwise the value passed in.
    """
    clr = ['blue', 'green', 'orange', 'red']  # Colors to plot with

    # Loops between each yKey key name
    for idx, subKey in enumerate(yKey):
        if subKey not in masterDict[line]:  # Skips y columns we don't have
            continue

        status = True
        x = masterDict[line][xKey]      # Creates list of xKey values
        y = masterDict[line][subKey]    # Creates list of yKey values

        # Colors repeat after the palette is used up so that more than
        # len(clr) series no longer raise an IndexError
        color = clr[idx % len(clr)]

        if pLegend is not None:
            p.circle(x,y, legend=pLegend[idx], color=color, line_width=2)
        else:
            p.circle(x,y, color=color, line_width=2)

    return p, status


""" gridfLayout ----------------------------------------------------------------
    Goal:   Takes a list of Bokeh Plots and creates a list of lists out of them
            so that the list becomes a two-dimensional list, each sub-list with
            as many values as there are plots in one row.
    From:   Called from bokehPlot

    Input:  (1) The list of plots and (2) how many columns should be made.
    Output: A list of lists
-----------------------------------------------------------------------------"""
def gridfLayout(pList, col):
    """Arrange a flat list of plots into rows of col plots and grid them.

    Input:  pList is the list of plots, col the number of plots per row.
    Output: Whatever gridplot returns for the list of rows. The last row is
            shorter when len(pList) is not a multiple of col.
    """
    # Rows start at index 0 so that the first plot is part of the grid;
    # slicing shortens the last row by itself, no error handling needed.
    pLists = [pList[start:start + col] for start in range(0, len(pList), col)]

    # Creates gridplot based on the list of Lists and returns it
    return gridplot(pLists)


""" combineDict ----------------------------------------------------------------
    Goal:   Combines two dictionaries and merges them based on their xKey.
    From:   Called from main

    Input:  (1) The first and (2) second dictionary which will be merged based
            on their (3) xKey key name. In the (4) list of yKey key names,
            the 'MONTHLY' in each key's name will be replaced by a string for
            (5) the first and (6) the second dict.
    Output: The combined dictionary
-----------------------------------------------------------------------------"""
def combineDict(inDict1, inDict2, yKey, xKey, name1, name2):
    """Merge the columns of two {line: {column: values}} dictionaries.

    Input:  inDict1 and inDict2 are the dictionaries to merge. In every column
            name, 'MONTHLY' is replaced by name1 (inDict1) or name2 (inDict2).
            yKey and xKey are accepted for interface compatibility but are
            not used.
    Output: A new dictionary with one entry per line of inDict1. Lines that
            only exist in inDict2 are dropped. Columns whose name is the same
            after renaming are taken from inDict2.
    """
    masterDict = {}

    # Copies every column of the first dictionary under its new name
    for line in inDict1:
        masterDict[line] = {}         # Creates entry for subway line
        try:
            for subKey in inDict1[line]:
                subKeyNew = subKey.replace('MONTHLY', name1)
                masterDict[line][subKeyNew] = inDict1[line][subKey]
        except (AttributeError, TypeError):
            # Entry is not a dictionary of string-named columns: leave what
            # was copied so far and move on
            continue

    # Adds the columns of the second dictionary to the lines already present
    for line in inDict2:
        if line not in masterDict:    # Only lines known from inDict1 are kept
            continue
        try:
            for subKey in inDict2[line]:
                subKeyNew = subKey.replace('MONTHLY', name2)
                masterDict[line][subKeyNew] = inDict2[line][subKey]
        except (AttributeError, TypeError):
            continue

    return masterDict


""" combineLine ----------------------------------------------------------------
    Goal:   Combines many lines from one dictionary and merges them based on
            their xKey. This assumes that each family of line has data recorded
            over the same period of time
    From:   Called from main

    Input:  (1) The first dictionary which will be merged over the (2) lists
            of subway lines which should be merged together (this should be a
            stacked list where the lowest list is the list of all subway lines
            that should be merged) and the average of each line's (3) key name.

            The (4) x-axis' key name should be the same across both lines & thus
            we will copy the longest list of x-axis key names.

    Output: The combined dictionary where we have keys only for unified lines
-----------------------------------------------------------------------------"""
def combineLine(inDict, familyList, yKey, xKey):
    """Average the yKey series of every subway line in a family.

    Input:  inDict maps a line name to its columns. familyList maps a family
            name to the list of line names that belong to it. yKey is the
            column to average, xKey the column holding the dates.
    Output: {family: {yKey: averages, xKey: dates}}. Values are paired by
            position, so the lines of one family are assumed to cover the
            same period; the result is cut to the shortest line. A date at
            which any line has no numeric value gets '' instead of a number.
    Raises: KeyError when a listed line (or its column) is not in inDict.
    """
    familyDict = {}

    # Loops between groups of family
    for family in familyList:
        valuesList = []     # One list of yKey values per line
        dateRange = []      # xKey values, taken from the last line listed

        for line in familyList[family]:
            valuesList.append(list(inDict[line][yKey]))
            dateRange = inDict[line][xKey]

        # zip(*...) yields, per date, the values of all lines of the family
        averageList = []
        for tempList in zip(*valuesList):
            try:
                numbers = [float(value) for value in tempList]
            except (TypeError, ValueError):
                # At least one line has no value ('' or None) for this date
                averageList.append('')
            else:
                averageList.append(np.mean(numbers))

        # Keeps dates and averages the same length even if lines differ
        familyDict[family] = {yKey: averageList,
                              xKey: dateRange[:len(averageList)]}

    return familyDict


""" lineStats ------------------------------------------------------------------
    Goal:   Performs a linear regression for each 'line' in the master
            dictionary to return its slope and r2 value.
    From:   Called from main

    Input:  (1) The master dictionary, the (2) single key of data to analyze,
            (3) and True=LogRegression or False=LinearRegression
    Output: A dictionary with the slope, r2 value, and the last data point (that
            is, the most recent data point obtained)
-----------------------------------------------------------------------------"""
def lineStats(masterDict, yKey, logReg):
    """Fit a regression of yKey against its position index for every line.

    Input:  masterDict maps a line name to its columns; yKey is the column to
            analyze. logReg=False fits a LinearRegression, logReg=True a
            LogisticRegression.
    Output: {line: {yKey: (slope, r2, lastPoint)}} where lastPoint is the last
            non-empty value of the column. For logReg=True, slope is the raw
            coef_ array and r2 is fixed to 1. A line whose column holds no
            values at all maps to an empty dictionary.
    """
    outData = {}    # Will hold a dictionary with line: (slope, r_value)

    # Loops between each subway's line in masterDict
    for line in masterDict:
        outData[line] = {}  # Append line to outData

        # Loads the data for a given line & key; a single dummy point is used
        # when the column does not exist
        try:
            yOld = masterDict[line][yKey]
        except (KeyError, TypeError):
            yOld = [1]

        # Drops the '' placeholders left for values that could not be parsed
        # (equality, not identity: 'is' on a literal is unreliable)
        yData = [item for item in yOld if item != '']

        # Nothing to fit and no last point to report
        if not yData:
            continue

        # We take the most recent data point
        lastPoint = float(yData[-1])

        # We can't calculate statistics where the x-axis is a datetime.
        # Here, the x points are simply 1..n as a column vector
        xData = np.arange(1, len(yData) + 1).reshape(len(yData), 1)

        # Creates the regression model and fits the dataset
        if not logReg:
            yData = np.reshape(np.array(yData), (len(yData), 1))
            clf = linear_model.LinearRegression()
        else:
            yData = np.reshape(np.array(yData), (len(yData),))
            clf = linear_model.LogisticRegression()
        clf.fit(xData, yData)

        # Obtains coefficient/slope & R2/accuracy score of model
        if not logReg:
            # coef_ is a one-element 2-D array here; take the element
            # explicitly because float() on such arrays is deprecated
            slope = float(np.ravel(clf.coef_)[0])
            r2 = float(clf.score(xData, yData))
        else:
            slope = clf.coef_
            r2 = 1

        # Saves slope, RSquared value, & the last data point to outData
        outData[line][yKey] = (slope, r2, lastPoint)

    return outData


""" Main Function: Loads all Values -----------------------------------------"""
# Accept File as Input Argument
inPath = '/Users/Matthew/Dropbox/Academics/AdvanceDS/5_Bokeh/Performance_NYCT.xml'

# List of keys we'll plot over for our xKey, yKey, & total keys
xKey = ['PERIOD_MONTH', 'PERIOD_YEAR']
yKey = ['MONTHLY_ACTUAL', 'MONTHLY_TARGET']
subKey = xKey + yKey

# Allocates Tuples for XML file import
waitData = ()

# Allocates Dictionary to save data for different categories
subWait = {}
subFail = {}
subOTP = {}
subTRD = {}

# Read XML File in one go; the with-block closes the file afterwards
with open(inPath,'r') as inFile:
    xmlFile = inFile.read()

# Parse XML File into Dictionary
xmlDict = xmltodict.parse(xmlFile, process_namespaces=True)

# This Dictionary will probably have many incidents where one row contains all
# the data since the XML file was formatted strangely. This part of the code
# unwraps single-entry dictionaries until we reach the level containing all
# data. Only dictionaries are unwrapped; a one-element list is left as it is.
while isinstance(xmlDict, dict) and len(xmlDict) == 1:
    xmlDict = next(iter(xmlDict.values()))


""" Main Function: Saves data to a master dictionary ---------------------------
    Each data type is saved into its own dict as {line: {month: x, year: y,}}
-----------------------------------------------------------------------------"""
# Scans through XML file & saves values to a master dictionary
for row in xmlDict:
    # Scans through each field of the entry (i.e. Subway line & metric)
    for item in row:
        try:
            value = row[item]
            # If a field matches one of these conditions, we save the entry
            if 'Subway Wait Assessment' in value:
                subWait = dictAdd(subWait, row, row['INDICATOR_NAME'], subKey)
            elif 'Mean Distance Between Failures - Subways' in value:
                subFail = dictAdd(subFail, row, row['INDICATOR_NAME'], subKey)
            elif 'OTP (Terminal)' in value:
                subOTP = dictAdd(subOTP, row, row['INDICATOR_NAME'], subKey)
            elif 'Total Ridership - Subways' in value:
                subTRD = dictAdd(subTRD, row, row['INDICATOR_NAME'], subKey)
            else:
                continue    # This field does not identify a metric we track
        except (AttributeError, KeyError, TypeError):
            # Field without text (e.g. None), entry that is not a dictionary,
            # or entry lacking a key we need: try the next field
            continue

        # The entry has been stored. Stop here so that a phrase appearing in
        # several fields of the same entry does not add the entry twice.
        break


""" Main Function: Modifies data in the master dictionary -------------------"""
# Adds a PERIOD_DATE column (built from the month & year columns) to each of
# the four data sets, in the order Wait, Fail, OTP, TRD
subWait, subFail, subOTP, subTRD = [
    createDate(dataSet, xKey[0], xKey[1], 'PERIOD_DATE')
    for dataSet in (subWait, subFail, subOTP, subTRD)
]

# From here on the x-axis is always the new date column
pxLabel = 'Date'
xKey = 'PERIOD_DATE'


""" Main Function: Plots each data set to an HTML file ----------------------"""
# One actual vs. target plot per metric, described as
# (data, title, y-axis label, legend names, file name, grid layout?)
metricPlots = (
    (subFail, 'Average Miles before Failure', 'Miles',
     ['Actual Miles', 'Target Miles'], 'Failure', False),
    (subTRD, 'Total Ridership', 'Riders',
     ['Actual Riders', 'Target Riders'], 'Riders', False),
    (subWait, 'Average Wait Time', 'Percent',
     ['Actual Wait Time', 'Target Wait Time'], 'Wait', True),
    (subOTP, 'On Time Rate', 'Percent',
     ['Actual OTP', 'Target OTP'], 'OTP', True),
)
for plotData, plotTitle, plotYLabel, plotLegend, plotFile, plotGrid in metricPlots:
    bokehPlot(plotData, xKey, yKey, plotTitle, pxLabel, plotYLabel,
              plotLegend, plotFile, plotGrid)


""" Main Function: Combines multiple data sets for plotting -----------------"""
# Combines the OTP and Wait data dictionaries & plots it.
# yKey/xKey are passed by keyword: combineDict declares them as (yKey, xKey),
# the opposite of the positional order that was used here before.
subOTPWait = combineDict(subOTP, subWait, yKey=yKey, xKey=xKey,
                         name1='OTP', name2='WAIT')
bokehPlot(subOTPWait, xKey, ['WAIT_ACTUAL', 'OTP_ACTUAL'], 'Wait vs. OTP',
          pxLabel, 'Percent', ['Wait', 'OTP'], 'OTPWait', True)


""" Main Function: Combines multiple subway lines for plotting --------------"""
# Groups the subway lines into 'families'; each family name maps to the keys
# of its member lines
familyList = {
    'ACE': ['ALine', 'CLine', 'ELine'],
    'BDFM': ['BLine', 'DLine', 'FLine', 'MLine'],
    'G': ['GLine'],
    'L': ['LLine'],
    'JZ': ['JZLine'],
    'NQR': ['NLine', 'QLine', 'RLine'],
    '123': ['1Line', '2Line', '3Line'],
    '456': ['4Line', '5Line', '6Line'],
    '7': ['7Line'],
    'SFkln': ['SLineFkln'],
    'S42': ['SLine42St'],
    'SRock': ['SLineRock'],
}

# Averages MONTHLY_ACTUAL over the lines of each family (OTP first, then Wait)
familyColumn = 'MONTHLY_ACTUAL'
subOTPFamily = combineLine(subOTP, familyList, familyColumn, xKey)
subWaitFamily = combineLine(subWait, familyList, familyColumn, xKey)

# Plots the family averages as grids (Wait first, then OTP)
familyPlots = (
    (subWaitFamily, 'Average Wait Time (Families)',
     ['Actual Wait Time', 'Target Wait Time'], 'WaitFamily'),
    (subOTPFamily, 'On Time Rate (Families)',
     ['Actual OTP', 'Target OTP'], 'OTPFamily'),
)
for familyData, familyTitle, familyLegend, familyFile in familyPlots:
    bokehPlot(familyData, xKey, yKey, familyTitle, pxLabel, 'Percent',
              familyLegend, familyFile, True)


""" Main Function: Combines multiple data sets & subway lines for plotting --"""
# Combines the OTP and Wait data dictionaries.
# yKey/xKey are passed by keyword: combineDict declares them as (yKey, xKey),
# the opposite of the positional order that was used here before.
subOTPWaitFamily = combineDict(subOTPFamily, subWaitFamily, yKey=yKey,
                               xKey=xKey, name1='OTP', name2='WAIT')

# Plots the combined dataset dictionary
bokehPlot(subOTPWaitFamily, xKey, ['WAIT_ACTUAL', 'OTP_ACTUAL'],
          'Wait vs. OTP (Families)', pxLabel, 'Percent',  ['Wait', 'OTP'],
          'OTPWaitFamily', True)


""" Main Function: Calculates slope statistics for each line ----------------"""
# Slope, r2 & most recent MONTHLY_ACTUAL value of every line, per data set
statsColumn = 'MONTHLY_ACTUAL'
subTRDStats = lineStats(subTRD, statsColumn, False)
subOTPStats = lineStats(subOTP, statsColumn, False)
subWaitStats = lineStats(subWait, statsColumn, False)
subFailStats = lineStats(subFail, statsColumn, False)

# Writes one CSV file per data set, rows sorted by line name
csvOutputs = (
    ('subOTPStats.csv', subOTPStats),
    ('subWaitStats.csv', subWaitStats),
    ('subTRDStats.csv', subTRDStats),
    ('subFailStats.csv', subFailStats),
)
for csvName, csvStats in csvOutputs:
    with open(csvName, 'wt') as f:
        csv_writer = csv.writer(f, delimiter=',')
        csv_writer.writerows(sorted(csvStats.items()))