This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ride totals per start station, split by bike type and then combined.
is_electric = df['rideable_type'] == 'electric_bike'
is_classic = df['rideable_type'] == 'classic_bike'
electric_bike_counts = df.loc[is_electric].groupby('start_station_name')['ride_id'].count()
classic_bike_counts = df.loc[is_classic].groupby('start_station_name')['ride_id'].count()
# Stations missing from one of the two series contribute 0 via fill_value.
combined_bike_counts = electric_bike_counts.add(classic_bike_counts, fill_value=0)
# Rank stations busiest-first and compute the running share of total rides.
sorted_combined_counts = combined_bike_counts.sort_values(ascending=False)
cumulative_percentage_bikes = sorted_combined_counts.cumsum() / sorted_combined_counts.sum()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Total rides per station, counting both departures (start) and arrivals (end).
# BUG FIX: adding two value_counts Series with `+` yields NaN for any station
# that appears only as a start or only as an end; Series.add with fill_value=0
# keeps those stations with their true one-sided count.
start_counts = df['start_station_name'].value_counts()
end_counts = df['end_station_name'].value_counts()
total_rides_per_station = start_counts.add(end_counts, fill_value=0)
# Rank stations by combined traffic and keep the head that accounts for the
# first 20% of all rides (a Pareto-style cut — NOT "top 20% of stations").
sorted_total_rides = total_rides_per_station.sort_values(ascending=False)
cumulative_percentage_total = sorted_total_rides.cumsum() / sorted_total_rides.sum()
top_20_total_stations = sorted_total_rides[cumulative_percentage_total <= 0.20]
# Departure counts for the selected stations; reindex (not .loc) so a station
# that never appears as a start shows 0 instead of raising a KeyError.
top_20_start_counts = start_counts.reindex(top_20_total_stations.index, fill_value=0)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Share of traffic captured as we walk down the busiest stations,
# computed independently for departures and arrivals.
sorted_start_stations = (
    df['start_station_name'].value_counts().sort_values(ascending=False)
)
sorted_end_stations = (
    df['end_station_name'].value_counts().sort_values(ascending=False)
)
# Running fraction of total rides covered by the top-k stations of each kind.
cumulative_percentage_start = sorted_start_stations.cumsum().div(sorted_start_stations.sum())
cumulative_percentage_end = sorted_end_stations.cumsum().div(sorted_end_stations.sum())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Frequency plots for the two categorical features of the trip data.
categorical_columns = df[['rideable_type', 'member_casual']]
plt.figure(figsize=(12, 6))
# Left panel: how often each rideable type occurs.
plt.subplot(1, 2, 1)
rideable_freq = df['rideable_type'].value_counts()
rideable_freq.plot(kind='bar', color='skyblue')
plt.title('Rideable Type Frequency')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt

# Distribution plots for the ride coordinate columns.
df = divvy_tripdata
plt.figure(figsize=(12, 8))
# Panel 1: starting latitude, with its mean marked by a dashed red line.
plt.subplot(2, 2, 1)
start_lat = df['start_lat']
plt.hist(start_lat, bins=30, color='skyblue', edgecolor='black')
plt.axvline(start_lat.mean(), color='red', linestyle='dashed', linewidth=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd

# Read the Divvy trip-data export for November 2023 into a DataFrame.
file_path_tripdata = '202311-divvy-tripdata.csv'
divvy_tripdata = pd.read_csv(file_path_tripdata)
# Echo the frame so its columns and sample rows are visible in the notebook.
divvy_tripdata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.cluster import KMeans

# Partition the daily top-item sales profiles into a fixed number of clusters.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=0)  # fixed seed for reproducibility
clusters = kmeans.fit_predict(combined_top_items_daily)
# Wrap the PCA scores in a labelled frame (columns PC1, PC2, ...).
pca_column_names = [f'PC{i + 1}' for i in range(n_components_combined)]
pca_df = pd.DataFrame(principal_components_combined, columns=pca_column_names)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Drop the Total_Daily_Sales, Explained Variance and Cumulative Explained
# Variance summary columns, keeping only the per-component values.
data = pca_components_combined_df.drop(
    columns=['Total_Daily_Sales', 'Explained Variance', 'Cumulative Explained Variance']
)
# Draw one bar chart per row of the remaining table.
for _, row in data.iterrows():
    plt.figure(figsize=(8, 4))      # set the plot size
    plt.xticks(rotation=90)         # show x-axis labels vertically
    plt.bar(row.index, row.values)  # bar chart of this row's values
    plt.xlabel('Columns')           # x-axis label
    plt.ylabel('Values')            # y-axis label
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

# Keep only the items whose cumulative sales stay within 80% of the total.
pareto_mask = item_sales_counts.cumsum() <= eighty_percent_threshold
top_items = item_sales_counts[pareto_mask].index
print('Filtering out the top items that make up 80% of the sales : {}'.format(list(top_items)))
# Restrict the transaction rows to those best-selling items.
filtered_top_items_data = bakery_data[bakery_data['Items'].isin(top_items)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Identify the items that together account for 80% of total sales.
# FIX: the original sorted the cumulative sums descending before filtering.
# The sort had no effect on which items pass the `<=` threshold test and only
# obscured the Pareto logic; select on the cumulative sum directly, matching
# the identical computation used for `filtered_top_items_data` earlier.
cumulative_sales = item_sales_counts.cumsum()
top_items = cumulative_sales[cumulative_sales <= eighty_percent_threshold].index
# Keep only the transactions involving those top items.
filtered_data = bakery_data[bakery_data['Items'].isin(top_items)]
# Per-day sales count of each top item (days x items matrix; absent item/day
# combinations become 0 rather than NaN).
daily_top_item_sales = filtered_data.groupby(['Date', 'Items']).size().unstack(fill_value=0)