import json
import os
from datetime import timedelta

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from fuzzywuzzy import fuzz
# Date structure used by each of the files, as a dict:
# * Key: data file name
# * Value: [datetime format, hour difference between the file's timezone and Eastern time]
date_formats = {
    'Bikeshare Ridership (2017 Q1).csv': ['%d/%m/%Y %H:%M', -4],
    'Bikeshare Ridership (2017 Q2).csv': ['%d/%m/%Y %H:%M', -4],
    'Bikeshare Ridership (2017 Q3).csv': ['%m/%d/%Y %H:%M', 0],
    'Bikeshare Ridership (2017 Q4).csv': ['%m/%d/%y %H:%M:%S', 0],
}
df = pd.DataFrame() # Initiate an empty DataFrame
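# The loading loop itself is not shown in the gist; a minimal sketch of what
# date_formats implies (assumptions: the quarterly CSVs live in ./data/ and use
# the trip_start_time / trip_stop_time column names referenced further down).
for file_name, (fmt, hour_offset) in date_formats.items():
    quarter = pd.read_csv(os.path.join('./data', file_name))
    for col in ['trip_start_time', 'trip_stop_time']:
        # Parse with the per-file format and shift to Eastern time
        quarter[col] = pd.to_datetime(quarter[col], format=fmt) + timedelta(hours=hour_offset)
    df = pd.concat([df, quarter], ignore_index=True)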
stations_start = df[['from_station_id', 'from_station_name']]
stations_end = df[['to_station_id', 'to_station_name']]
stations_start.columns = stations_end.columns = ['station_id', 'name']
# Extract the unique station ID and name combinations from the from_station and to_station columns
stations = pd.concat([stations_start, stations_end]).dropna(how='all').drop_duplicates().reset_index(drop=True)
# Separate the stations without station IDs
no_ids = stations[stations['station_id'].isnull()].copy()
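# bikeshare_stations is not defined in the gist; presumably it comes from the live
# station feed (hence the requests/json imports above). A minimal sketch, assuming
# the Toronto Bike Share GBFS station_information endpoint (the exact URL is an assumption):
resp = requests.get('https://tor.publicbikesystem.net/ube/gbfs/v1/en/station_information')
bikeshare_stations = pd.DataFrame(json.loads(resp.text)['data']['stations'])
bikeshare_stations = bikeshare_stations[['station_id', 'name', 'lat', 'lon']]
bikeshare_stations['station_id'] = bikeshare_stations['station_id'].astype(float)  # match the CSV's numeric IDs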
for idx, miss in no_ids.iterrows():
    max_score = 0
    # Compare the similarity of the station without ID to each station in the API data
    for i, exist in bikeshare_stations[['station_id', 'name']].iterrows():
        score = fuzz.ratio(miss['name'], exist['name'])
        if score > 80 and score > max_score:
            # Keep the best match above the threshold and take its station ID
            max_score = score
            no_ids.loc[idx, 'station_id'] = exist['station_id']
# Recombine the matched stations and attach coordinates from the API data
stations = pd.concat([stations[~stations['station_id'].isnull()], no_ids])\
    .merge(bikeshare_stations[['station_id', 'lat', 'lon']], how='inner', on='station_id')\
    .drop_duplicates()
# Attach the start and end station details to each trip
df = df.merge(stations, how='inner', left_on='from_station_name', right_on='name') \
    .merge(stations, how='inner', left_on='to_station_name', right_on='name', suffixes=['_from', '_to']) \
    .drop_duplicates()
# Drop the original station ID/name columns and the trip stop time, which are now redundant
df = df[[x for x in df.columns if not x.endswith('_station_id') and not x.endswith('_station_name') and x != 'trip_stop_time']]
# Plot the trip-duration distribution (the original used sns.distplot(..., hist=False),
# which has since been deprecated; kdeplot is the current equivalent)
fig, ax = plt.subplots(1, 1, figsize=(16, 9))
sns.kdeplot(df['trip_duration_seconds'], ax=ax)
# Remove false starts (trips shorter than 60 seconds)
df = df[df['trip_duration_seconds'] >= 60]
# Remove outliers using the 1.5 * IQR rule
q1 = df['trip_duration_seconds'].quantile(0.25)
q3 = df['trip_duration_seconds'].quantile(0.75)
interquartile_range = q3 - q1
df = df[~((df['trip_duration_seconds'] < (q1 - 1.5 * interquartile_range)) |
          (df['trip_duration_seconds'] > (q3 + 1.5 * interquartile_range)))]
df.to_csv('./data/bikeshare_ridership.csv', index=False)
import datetime as dt
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from geopy import distance
from pandas.api.types import CategoricalDtype
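# The analysis section does not show the data being reloaded; presumably it reads
# back the CSV written above:
df = pd.read_csv('./data/bikeshare_ridership.csv')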
# Clean up column names for ease of use
df.columns = [' '.join(x.replace('trip_', '').replace('_seconds', '').split('_')).title() for x in df.columns]
df['Start Time'] = pd.to_datetime(df['Start Time'])
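# month_type and day_type are not defined in the gist; the CategoricalDtype import
# suggests ordered categoricals so months and weekdays sort chronologically in plots.
# An assumed definition:
month_type = CategoricalDtype(
    ['January', 'February', 'March', 'April', 'May', 'June', 'July',
     'August', 'September', 'October', 'November', 'December'], ordered=True)
day_type = CategoricalDtype(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], ordered=True)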
df['Date'] = df['Start Time'].dt.strftime('%Y-%m-%d')
df['Quarter'] = df['Start Time'].dt.quarter
df['Month'] = df['Start Time'].dt.strftime('%B').astype(month_type)
df['Day of Week'] = df['Start Time'].dt.strftime('%a').astype(day_type)
df['Hour'] = df['Start Time'].dt.strftime('%H')
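# The geopy import above implies a station-to-station distance feature; a minimal
# sketch, assuming straight-line distance in km between the joined coordinates
# (the 'Distance' column name is an assumption; the other names follow the renaming above):
df['Distance'] = df.apply(
    lambda row: distance.distance((row['Lat From'], row['Lon From']),
                                  (row['Lat To'], row['Lon To'])).km,
    axis=1)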