import json
import os
from datetime import timedelta

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from fuzzywuzzy import fuzz
# Date structure used by each of the files, as a dict:
# * Key: data file name
# * Value: [datetime format, hour difference between the file's timezone and Eastern time]
date_formats = {
    'Bikeshare Ridership (2017 Q1).csv': ['%d/%m/%Y %H:%M', -4],
    'Bikeshare Ridership (2017 Q2).csv': ['%d/%m/%Y %H:%M', -4],
    'Bikeshare Ridership (2017 Q3).csv': ['%m/%d/%Y %H:%M', 0],
    'Bikeshare Ridership (2017 Q4).csv': ['%m/%d/%y %H:%M:%S', 0],
}
df = pd.DataFrame() # Initiate an empty DataFrame
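# The loading loop itself is not shown in the gist; a minimal sketch of what
# date_formats implies (assumptions: the quarterly CSVs live in ./data/ and use
# the trip_start_time / trip_stop_time column names referenced further down).
for file_name, (fmt, hour_offset) in date_formats.items():
    quarter = pd.read_csv(os.path.join('./data', file_name))
    for col in ['trip_start_time', 'trip_stop_time']:
        # Parse with the per-file format and shift to Eastern time
        quarter[col] = pd.to_datetime(quarter[col], format=fmt) + timedelta(hours=hour_offset)
    df = pd.concat([df, quarter], ignore_index=True)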
stations_start = df[['from_station_id', 'from_station_name']]
stations_end = df[['to_station_id', 'to_station_name']]
stations_start.columns = stations_end.columns = ['station_id', 'name']
# Extract the unique station ID and name combinations from the from_station and to_station columns
stations = pd.concat([stations_start, stations_end]).dropna(how='all').drop_duplicates().reset_index(drop=True)
# Separate the stations without station IDs
no_ids = stations[stations['station_id'].isnull()].copy()
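# bikeshare_stations is not defined in the gist; presumably it comes from the live
# station feed (hence the requests/json imports above). A minimal sketch, assuming
# the Toronto Bike Share GBFS station_information endpoint (the exact URL is an assumption):
resp = requests.get('https://tor.publicbikesystem.net/ube/gbfs/v1/en/station_information')
bikeshare_stations = pd.DataFrame(json.loads(resp.text)['data']['stations'])
bikeshare_stations = bikeshare_stations[['station_id', 'name', 'lat', 'lon']]
bikeshare_stations['station_id'] = bikeshare_stations['station_id'].astype(float)  # match the CSV's numeric IDs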
for idx, miss in no_ids.iterrows():
    max_score = 0
    # Compare the similarity of the station without ID to each station in the API data
    for i, exist in bikeshare_stations[['station_id', 'name']].iterrows():
        score = fuzz.ratio(miss['name'], exist['name'])
        if score > 80 and score > max_score:
            # Keep the best match above the threshold and take its station ID
            max_score = score
            no_ids.loc[idx, 'station_id'] = exist['station_id']
# Recombine the matched stations and attach coordinates from the API data
stations = pd.concat([stations[~stations['station_id'].isnull()], no_ids])\
    .merge(bikeshare_stations[['station_id', 'lat', 'lon']], how='inner', on='station_id')\
    .drop_duplicates()
# Attach the start and end station details to each trip
df = df.merge(stations, how='inner', left_on='from_station_name', right_on='name') \
    .merge(stations, how='inner', left_on='to_station_name', right_on='name', suffixes=['_from', '_to']) \
    .drop_duplicates()
# Drop the original station ID/name columns and the trip stop time, which are now redundant
df = df[[x for x in df.columns if not x.endswith('_station_id') and not x.endswith('_station_name') and x != 'trip_stop_time']]
# Plot the trip-duration distribution (the original used sns.distplot(..., hist=False),
# which has since been deprecated; kdeplot is the current equivalent)
fig, ax = plt.subplots(1, 1, figsize=(16, 9))
sns.kdeplot(df['trip_duration_seconds'], ax=ax)
# Remove false starts (trips shorter than 60 seconds)
df = df[df['trip_duration_seconds'] >= 60]
# Remove outliers using the 1.5 * IQR rule
q1 = df['trip_duration_seconds'].quantile(0.25)
q3 = df['trip_duration_seconds'].quantile(0.75)
interquartile_range = q3 - q1
df = df[~((df['trip_duration_seconds'] < (q1 - 1.5 * interquartile_range)) |
          (df['trip_duration_seconds'] > (q3 + 1.5 * interquartile_range)))]
df.to_csv('./data/bikeshare_ridership.csv', index=False)
import datetime as dt
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from geopy import distance
from pandas.api.types import CategoricalDtype
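# The analysis section does not show the data being reloaded; presumably it reads
# back the CSV written above:
df = pd.read_csv('./data/bikeshare_ridership.csv')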
# Clean up column names for ease of use
df.columns = [' '.join(x.replace('trip_', '').replace('_seconds', '').split('_')).title() for x in df.columns]
df['Start Time'] = pd.to_datetime(df['Start Time'])
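# month_type and day_type are not defined in the gist; the CategoricalDtype import
# suggests ordered categoricals so months and weekdays sort chronologically in plots.
# An assumed definition:
month_type = CategoricalDtype(
    ['January', 'February', 'March', 'April', 'May', 'June', 'July',
     'August', 'September', 'October', 'November', 'December'], ordered=True)
day_type = CategoricalDtype(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], ordered=True)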
df['Date'] = df['Start Time'].dt.strftime('%Y-%m-%d')
df['Quarter'] = df['Start Time'].dt.quarter
df['Month'] = df['Start Time'].dt.strftime('%B').astype(month_type)
df['Day of Week'] = df['Start Time'].dt.strftime('%a').astype(day_type)
df['Hour'] = df['Start Time'].dt.strftime('%H')
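# The geopy import above implies a station-to-station distance feature; a minimal
# sketch, assuming straight-line distance in km between the joined coordinates
# (the 'Distance' column name is an assumption; the other names follow the renaming above):
df['Distance'] = df.apply(
    lambda row: distance.distance((row['Lat From'], row['Lon From']),
                                  (row['Lat To'], row['Lon To'])).km,
    axis=1)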