from datetime import timedelta
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
import json
import os
# Identify the date structure used by each of the files as a dict:
# * Key: data file name
# * Value: [datetime format, hour difference between timezone used and Eastern timezone]
# Per-file parsing hints: each value is [strptime-style format string, hour offset].
# NOTE(review): the closing brace of this dict (and possibly further entries) is not
# visible in this view - confirm against the full source.
date_formats = {
# Q1 and Q2: day-first dates, no seconds, offset of -4
'Bikeshare Ridership (2017 Q1).csv': ['%d/%m/%Y %H:%M', -4],
'Bikeshare Ridership (2017 Q2).csv': ['%d/%m/%Y %H:%M', -4],
# Q3: month-first dates, no seconds, offset of 0
'Bikeshare Ridership (2017 Q3).csv': ['%m/%d/%Y %H:%M', 0],
# Q4: month-first dates with a two-digit year and seconds, offset of 0
'Bikeshare Ridership (2017 Q4).csv': ['%m/%d/%y %H:%M:%S', 0],
# Accumulator for the trip records; presumably filled by loading code that is
# not visible here - TODO confirm
df = pd.DataFrame() # Initiate an empty DataFrame
# Pull the (id, name) pair for each trip's origin and destination station and
# give both frames the same column names so they can be stacked
stations_start = df[['from_station_id', 'from_station_name']].rename(
    columns={'from_station_id': 'station_id', 'from_station_name': 'name'}
)
stations_end = df[['to_station_id', 'to_station_name']].rename(
    columns={'to_station_id': 'station_id', 'to_station_name': 'name'}
)
# Stack origins and destinations, discard rows where both id and name are
# missing, and keep each distinct (station_id, name) combination once
stations = (
    pd.concat([stations_start, stations_end])
    .dropna(how='all')
    .drop_duplicates()
    .reset_index(drop=True)
)
# Separate the stations without station IDs
no_ids = stations[stations['station_id'].isnull()]
# For every ID-less station, scan the API station list for the most similar name.
# NOTE(review): indentation and the body of the final `if` are missing in this
# view; presumably the body records the new best score and the matched
# station_id - confirm against the full source.
for idx, miss in no_ids.iterrows():
# Best similarity score seen so far for the current ID-less station
max_score = 0
# Compare the similarity of the station without ID to each station in the API data
for i, exist in bikeshare_stations[['station_id', 'name']].iterrows():
# Similarity between the two station names as computed by fuzz.ratio
score = fuzz.ratio(miss['name'], exist['name'])
# Only accept candidates scoring above 80 that also beat the current best
if score > 80 and score > max_score:
# Recombine the stations that already had an ID with the `no_ids` rows, then
# inner-join `bikeshare_stations` on station_id to attach lat/lon.
# The trips in `df` are then inner-joined to `stations` twice by station name
# (start station first, then end station; overlapping columns receive the
# `_from` / `_to` suffixes). Finally every column ending in `_station_id` or
# `_station_name`, plus `trip_stop_time`, is dropped.
# NOTE(review): the first two statements end in line continuations whose
# following lines are not visible in this view - confirm against the full source.
stations = pd.concat([stations[~stations['station_id'].isnull()], no_ids])\
.merge(bikeshare_stations[['station_id', 'lat', 'lon']], how='inner', on='station_id')\
df = df.merge(stations, how='inner', left_on='from_station_name', right_on='name') \
.merge(stations, how='inner', left_on='to_station_name', right_on='name', suffixes=['_from', '_to']) \
df = df[[x for x in df.columns if not x.endswith('_station_id') and not x.endswith('_station_name') and x != 'trip_stop_time']]
# Plot the kernel density estimate of trip durations (in seconds) on a 16x9 figure.
# `sns.distplot` is deprecated and scheduled for removal from seaborn; with
# `hist=False` it only drew the KDE curve, which `sns.kdeplot` draws directly.
fig, ax = plt.subplots(1, 1, figsize=(16, 9))
sns.kdeplot(df['trip_duration_seconds'], ax=ax)
# Drop "false start" trips, i.e. anything shorter than 60 seconds
df = df.loc[df['trip_duration_seconds'].ge(60)]
# Quartiles and interquartile range of the remaining durations, used by the
# outlier filter that follows
q1, q3 = df['trip_duration_seconds'].quantile([0.25, 0.75])
interquartile_range = q3 - q1
# Exclude rows whose duration falls below the lower bound q1 - 1.5 * IQR, then
# write the cleaned trips to ./data/bikeshare_ridership.csv without the index.
# NOTE(review): the filter expression is cut off in this view (unclosed
# parentheses and a trailing line continuation); presumably an upper-bound
# condition follows - confirm against the full source.
df = df[~((df['trip_duration_seconds'] < (q1 - 1.5 * interquartile_range)) \
df.to_csv('./data/bikeshare_ridership.csv', index=False)
import datetime as dt
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from geopy import distance
from pandas.api.types import CategoricalDtype
# Clean up column names for ease of use: remove 'trip_' and '_seconds', then
# turn snake_case into Title Case (e.g. 'trip_start_time' -> 'Start Time')
df.columns = [
    x.replace('trip_', '').replace('_seconds', '').replace('_', ' ').title()
    for x in df.columns
]
df['Start Time'] = pd.to_datetime(df['Start Time'])

# Derive calendar features from the start timestamp with the vectorized `.dt`
# accessor rather than a per-row `apply(lambda ...)`: same values, computed
# column-wise, and missing timestamps (NaT) yield missing values instead of
# raising inside strftime.
df['Date'] = df['Start Time'].dt.strftime('%Y-%m-%d')
# Quarter number 1-4, equivalent to (month - 1) // 3 + 1
df['Quarter'] = df['Start Time'].dt.quarter
# Full month name / abbreviated weekday name, cast to the categorical dtypes
# `month_type` and `day_type` defined elsewhere in the file
df['Month'] = df['Start Time'].dt.strftime('%B').astype(month_type)
df['Day of Week'] = df['Start Time'].dt.strftime('%a').astype(day_type)
# Hour kept as a zero-padded two-character string ('00'-'23'), as before
df['Hour'] = df['Start Time'].dt.strftime('%H')