smzn / gist:444f620397c9170f6d9446e4a9e75aef
Created January 4, 2024 03:10
the total number of rides for each station and bike type combination
# Calculating the total number of rides for each station and bike type combination
electric_bike_counts = df[df['rideable_type'] == 'electric_bike'].groupby('start_station_name')['ride_id'].count()
classic_bike_counts = df[df['rideable_type'] == 'classic_bike'].groupby('start_station_name')['ride_id'].count()
# Combining the counts for each station
combined_bike_counts = electric_bike_counts.add(classic_bike_counts, fill_value=0)
# Sorting stations by total rides and selecting the top stations
sorted_combined_counts = combined_bike_counts.sort_values(ascending=False)
cumulative_percentage_bikes = sorted_combined_counts.cumsum() / sorted_combined_counts.sum()
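The preview cuts off before the cumulative percentage is put to use. A minimal sketch of the likely next step, reusing the 20% cutoff from the gist below (the 0.20 threshold and the variable name top_bike_stations are assumptions):
# Select the stations that together account for the top 20% of combined rides
# (illustrative; mirrors the top-20% selection in the next gist)
top_bike_stations = sorted_combined_counts[cumulative_percentage_bikes <= 0.20]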
smzn / gist:1068a34bfd9e40f588afa47ea588cc49
Created January 4, 2024 02:47
total rides for each station (both start and end)
# Calculating total rides for each station (both start and end);
# using .add with fill_value=0 so stations appearing only as a start or only as an end are kept
total_rides_per_station = df['start_station_name'].value_counts().add(
    df['end_station_name'].value_counts(), fill_value=0)
# Sorting stations by total rides and selecting the top 20%
sorted_total_rides = total_rides_per_station.sort_values(ascending=False)
cumulative_percentage_total = sorted_total_rides.cumsum() / sorted_total_rides.sum()
top_20_total_stations = sorted_total_rides[cumulative_percentage_total <= 0.20]
# Separating the counts for start and end stations for the top 20%
top_20_start_counts = df['start_station_name'].value_counts().reindex(
    top_20_total_stations.index, fill_value=0)
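The preview truncates here; the symmetric end-station counterpart would plausibly be (the variable name top_20_end_counts is an assumption):
# End-station counts for the same top-20% stations (illustrative)
top_20_end_counts = df['end_station_name'].value_counts().reindex(
    top_20_total_stations.index, fill_value=0)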
smzn / gist:1ade1b6bb8530fbbf4ac1db8600ba395
Created January 4, 2024 02:46
Adjusting to select stations that make up the top 20% of all stations by count
# Adjusting to select stations that make up the top 20% of all stations by count
# Sorting the station counts
sorted_start_stations = df['start_station_name'].value_counts().sort_values(ascending=False)
sorted_end_stations = df['end_station_name'].value_counts().sort_values(ascending=False)
# Calculating the cumulative percentage
cumulative_percentage_start = sorted_start_stations.cumsum() / sorted_start_stations.sum()
cumulative_percentage_end = sorted_end_stations.cumsum() / sorted_end_stations.sum()
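The preview ends before the cutoff is applied; a minimal sketch of the selection step, reusing the 0.20 threshold from the gist above (variable names are assumptions):
# Keep the stations that make up the top 20% of rides on each side
top_20_start_stations = sorted_start_stations[cumulative_percentage_start <= 0.20]
top_20_end_stations = sorted_end_stations[cumulative_percentage_end <= 0.20]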
smzn / gist:d40ae7adbad91fe42395d22a6b9fd80d
Created January 4, 2024 02:06
histograms for each categorical column
import matplotlib.pyplot as plt
# Selecting categorical columns
categorical_columns = df[['rideable_type', 'member_casual']]
# Creating frequency bar charts for each categorical column
plt.figure(figsize=(12, 6))
# Rideable Type
plt.subplot(1, 2, 1)
df['rideable_type'].value_counts().plot(kind='bar', color='skyblue')
plt.title('Rideable Type Frequency')
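The preview stops after the first panel; the second presumably mirrors it for member_casual (a sketch; the color and the tight_layout/show calls are assumptions):
# Member vs. Casual
plt.subplot(1, 2, 2)
df['member_casual'].value_counts().plot(kind='bar', color='salmon')  # color is illustrative
plt.title('Member vs. Casual Frequency')
plt.tight_layout()
plt.show()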
smzn / gist:098114fab724eda7a77186c596d0cd0d
Created January 4, 2024 01:56
histograms for each of the latitude and longitude columns
import matplotlib.pyplot as plt
# Creating separate histograms for each of the latitude and longitude columns
df = divvy_tripdata
plt.figure(figsize=(12, 8))
# Start Latitude
plt.subplot(2, 2, 1)
plt.hist(df['start_lat'], bins=30, color='skyblue', edgecolor='black')
plt.axvline(df['start_lat'].mean(), color='red', linestyle='dashed', linewidth=1)
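Only the first of the four panels survives the preview; the remaining ones presumably repeat the pattern for the other coordinate columns (a sketch assuming the standard Divvy column names start_lng, end_lat, and end_lng; dropna() guards against the missing end coordinates these files typically contain):
plt.title('Start Latitude')
# Remaining panels (column names assumed)
for i, col in enumerate(['start_lng', 'end_lat', 'end_lng'], start=2):
    plt.subplot(2, 2, i)
    plt.hist(df[col].dropna(), bins=30, color='skyblue', edgecolor='black')
    plt.axvline(df[col].mean(), color='red', linestyle='dashed', linewidth=1)
    plt.title(col)
plt.tight_layout()
plt.show()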
smzn / gist:9988aff6f2b5cef4b8d2c2de2b8e47da
Created January 4, 2024 01:38
202311-divvy-tripdata.csv
import pandas as pd
# Load the new CSV file
file_path_tripdata = '202311-divvy-tripdata.csv'
divvy_tripdata = pd.read_csv(file_path_tripdata)
# Display the first few rows of the dataframe to understand its structure and content
divvy_tripdata.head()
from sklearn.cluster import KMeans
# Clustering (combined_top_items_daily, principal_components_combined, and n_components_combined
# are presumably defined in the bakery PCA gists further down this page)
# Number of clusters
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
clusters = kmeans.fit_predict(combined_top_items_daily)
# Creating a DataFrame for the PCA components
pca_df = pd.DataFrame(principal_components_combined, columns=[f'PC{i+1}' for i in range(n_components_combined)])
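The preview ends here; a natural continuation is to attach the cluster labels and plot the first two components (a sketch; it assumes matplotlib.pyplot is imported as plt, as in the gists above):
# Attach cluster labels and visualize the first two principal components
pca_df['Cluster'] = clusters
plt.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster'], cmap='viridis')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('KMeans clusters in PCA space')
plt.show()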
smzn / gist:1cfcad7a4122eb02af08ee3cabe46c38
Created December 27, 2023 01:49
Display the elements of each principal component as a histogram
import matplotlib.pyplot as plt
# Exclude the Total_Daily_Sales, Explained Variance, and Cumulative Explained Variance columns
data = pca_components_combined_df.drop(columns=['Total_Daily_Sales', 'Explained Variance', 'Cumulative Explained Variance'])
# Plot the elements of each row (each principal component) as a bar chart
for index, row in data.iterrows():
    plt.figure(figsize=(8, 4))  # Set the figure size
    plt.bar(row.index, row.values)  # Draw the bar chart
    plt.xticks(rotation=90)  # Rotate the x-axis labels vertically (after plotting, so it applies to the category ticks)
    plt.xlabel('Columns')  # x-axis label
    plt.ylabel('Values')  # y-axis label
    plt.show()  # Render one figure per principal component
smzn / gist:a61154820c85a1957154ba63a837d378
Last active December 27, 2023 01:23
Applying PCA to the combined data
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
# Filtering out the top items that make up 80% of the sales
top_items = item_sales_counts[item_sales_counts.cumsum() <= eighty_percent_threshold].index
print('Filtering out the top items that make up 80% of the sales : {}'.format(list(top_items)))
# Selecting sales data for these top items
filtered_top_items_data = bakery_data[bakery_data['Items'].isin(top_items)]
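The preview cuts off before the PCA itself. Judging from the variable names used in the clustering snippet above (combined_top_items_daily, principal_components_combined, n_components_combined), the remaining steps plausibly look like this; the pivot and the choice of n_components are assumptions:
# Pivot to a Date x Items matrix of daily sales counts (same pattern as the gist below)
combined_top_items_daily = filtered_top_items_data.groupby(['Date', 'Items']).size().unstack(fill_value=0)
# Standardize, then apply PCA
scaled = StandardScaler().fit_transform(combined_top_items_daily)
n_components_combined = 5  # illustrative choice
pca = PCA(n_components=n_components_combined)
principal_components_combined = pca.fit_transform(scaled)
print(pca.explained_variance_ratio_)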
smzn / gist:f1a41feae17a7b3d720b9442c7dbf430
Created December 26, 2023 06:59
Counting the number of each top item sold each day
# Identifying the items that make up 80% of total sales
# (item_sales_counts is assumed sorted in descending order, as value_counts returns, so its cumulative sum is monotone)
top_items = item_sales_counts[item_sales_counts.cumsum() <= eighty_percent_threshold].index
# Filtering data for only the top items
filtered_data = bakery_data[bakery_data['Items'].isin(top_items)]
# Counting the number of each top item sold each day
daily_top_item_sales = filtered_data.groupby(['Date', 'Items']).size().unstack(fill_value=0)
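A quick, illustrative way to sanity-check the resulting Date x Items matrix:
# Each row is a day, each column a top item, values are units sold that day
print(daily_top_item_sales.shape)
print(daily_top_item_sales.head())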