Instantly share code, notes, and snippets.
dottyz
/ story_bike_share_clean_1.py
Created
May 2, 2019 18:24
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import timedelta | |
from fuzzywuzzy import fuzz | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import requests | |
import seaborn as sns | |
import json | |
import os |
dottyz
/ story_bike_share_clean_2.py
Created
May 2, 2019 18:29
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Date structure used by each of the quarterly ridership files, as a dict:
#   * Key: data file name
#   * Value: [datetime format, hour difference between the timezone used
#     in the file and the Eastern timezone]
date_formats = {
    'Bikeshare Ridership (2017 Q1).csv': ['%d/%m/%Y %H:%M', -4],
    'Bikeshare Ridership (2017 Q2).csv': ['%d/%m/%Y %H:%M', -4],
    'Bikeshare Ridership (2017 Q3).csv': ['%m/%d/%Y %H:%M', 0],
    'Bikeshare Ridership (2017 Q4).csv': ['%m/%d/%y %H:%M:%S', 0],
}

df = pd.DataFrame()  # Start from an empty DataFrame
dottyz
/ story_bike_share_clean_3.py
Created
May 2, 2019 18:30
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build one (station_id, name) table out of both trip endpoints.
# `rename` returns new frames, so neither the original `df` nor a shared
# label list is mutated while normalising the column names.
stations_start = df[['from_station_id', 'from_station_name']].rename(
    columns={'from_station_id': 'station_id', 'from_station_name': 'name'}
)
stations_end = df[['to_station_id', 'to_station_name']].rename(
    columns={'to_station_id': 'station_id', 'to_station_name': 'name'}
)

# Extract the unique station ID and name combinations from the from_station
# and to_station columns. Rows where both ID and name are missing are
# dropped; rows missing only one of the two are kept.
stations = (
    pd.concat([stations_start, stations_end])
    .dropna(how='all')
    .drop_duplicates()
    .reset_index(drop=True)
)
dottyz
/ story_bike_share_clean_4.py
Created
May 2, 2019 18:31
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Separate the stations without station IDs | |
no_ids = stations[stations['station_id'].isnull()] | |
for idx, miss in no_ids.iterrows(): | |
max_score = 0 | |
# Compare the similarity of the station without ID to each station in the API data | |
for i, exist in bikeshare_stations[['station_id', 'name']].iterrows(): | |
score = fuzz.ratio(miss['name'], exist['name']) | |
if score > 80 and score > max_score: |
dottyz
/ story_bike_share_clean_5.py
Created
May 2, 2019 18:32
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Recombine the stations that already had an ID with the `no_ids` rows
# (presumably filled in by the preceding matching step; confirm against
# that code), then attach coordinates from `bikeshare_stations`.
# The inner join drops every station whose ID is absent from
# `bikeshare_stations`.
stations = (
    pd.concat([stations[stations['station_id'].notnull()], no_ids])
    .merge(bikeshare_stations[['station_id', 'lat', 'lon']], how='inner', on='station_id')
    .drop_duplicates()
)

# Attach station details to both ends of every trip by station name.
# The second merge collides with the columns added by the first one, so
# they come out suffixed with `_from` / `_to`. Inner joins discard trips
# whose start or end station name has no match in `stations`.
df = (
    df.merge(stations, how='inner', left_on='from_station_name', right_on='name')
    .merge(stations, how='inner', left_on='to_station_name', right_on='name',
           suffixes=('_from', '_to'))
    .drop_duplicates()
)

# Drop the raw station ID/name columns and the trip stop time.
df = df[[
    col for col in df.columns
    if not col.endswith(('_station_id', '_station_name')) and col != 'trip_stop_time'
]]
dottyz
/ story_bike_share_clean_6.py
Created
May 2, 2019 18:32
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plot the density of trip durations on a single 16x9 figure.
# `sns.distplot` is deprecated; with `hist=False` it only drew the KDE
# curve, which `sns.kdeplot` draws directly.
fig, ax = plt.subplots(1, 1, figsize=(16, 9))
sns.kdeplot(df['trip_duration_seconds'], ax=ax)
dottyz
/ story_bike_share_clean_7.py
Created
May 2, 2019 18:34
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove false-start trips: keep only rows whose 'trip_duration_seconds'
# value is at least 60.
df = df[df['trip_duration_seconds'] >= 60]

# Remove outliers: compute the quartiles and the interquartile range of the
# remaining durations; the filter that follows uses q1 and
# interquartile_range to build its 1.5 * IQR bounds.
q1 = df['trip_duration_seconds'].quantile(0.25)
q3 = df['trip_duration_seconds'].quantile(0.75)
interquartile_range = q3 - q1
df = df[~((df['trip_duration_seconds'] < (q1 - 1.5 * interquartile_range)) \ |
dottyz
/ story_bike_share_clean_8.py
Created
May 2, 2019 18:35
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Persist the cleaned ridership data; the row index is not written out.
df.to_csv('./data/bikeshare_ridership.csv', index=False)
dottyz
/ story_bike_share_analyze_1.py
Created
May 2, 2019 18:36
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime as dt | |
import re | |
import numpy as np | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from geopy import distance | |
from pandas.api.types import CategoricalDtype |
dottyz
/ story_bike_share_analyze_2.py
Created
May 2, 2019 18:36
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clean up column names for ease of use: strip the 'trip_' and '_seconds'
# markers, turn underscores into spaces and title-case the result
# (e.g. 'trip_start_time' -> 'Start Time').
df.columns = [
    ' '.join(col.replace('trip_', '').replace('_seconds', '').split('_')).title()
    for col in df.columns
]

df['Start Time'] = pd.to_datetime(df['Start Time'])

# Derive calendar features with the vectorised `.dt` accessor instead of a
# per-row `apply(lambda ...)`; besides being faster, `.dt` yields missing
# values for NaT rows instead of raising.
df['Date'] = df['Start Time'].dt.strftime('%Y-%m-%d')
df['Quarter'] = df['Start Time'].dt.quarter
# `month_type` / `day_type` are defined outside this snippet (presumably
# ordered categorical dtypes; confirm at their definition).
df['Month'] = df['Start Time'].dt.strftime('%B').astype(month_type)
df['Day of Week'] = df['Start Time'].dt.strftime('%a').astype(day_type)
# Kept as a zero-padded string ('00'..'23'), as before.
df['Hour'] = df['Start Time'].dt.strftime('%H')
Older Newer