smzn / gist:444f620397c9170f6d9446e4a9e75aef
Created January 4, 2024 03:10
the total number of rides for each station and bike type combination
# Calculating the total number of rides for each station and bike type combination
electric_bike_counts = df[df['rideable_type'] == 'electric_bike'].groupby('start_station_name')['ride_id'].count()
classic_bike_counts = df[df['rideable_type'] == 'classic_bike'].groupby('start_station_name')['ride_id'].count()
# Combining the counts for each station
combined_bike_counts = electric_bike_counts.add(classic_bike_counts, fill_value=0)
# Sorting stations by total rides and selecting the top stations
sorted_combined_counts = combined_bike_counts.sort_values(ascending=False)
cumulative_percentage_bikes = sorted_combined_counts.cumsum() / sorted_combined_counts.sum()
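The preview cuts off before the cumulative percentage is put to use. A minimal sketch of the likely next step, reusing the 20% cutoff from the gist below (the 0.20 threshold and the variable name top_bike_stations are assumptions):
# Select the stations that together account for the top 20% of combined rides
# (illustrative; mirrors the top-20% selection in the next gist)
top_bike_stations = sorted_combined_counts[cumulative_percentage_bikes <= 0.20]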
smzn / gist:1068a34bfd9e40f588afa47ea588cc49
Created January 4, 2024 02:47
total rides for each station (both start and end)
# Calculating total rides for each station (both start and end);
# using .add with fill_value=0 so stations appearing only as a start or only as an end are kept
total_rides_per_station = df['start_station_name'].value_counts().add(
    df['end_station_name'].value_counts(), fill_value=0)
# Sorting stations by total rides and selecting the top 20%
sorted_total_rides = total_rides_per_station.sort_values(ascending=False)
cumulative_percentage_total = sorted_total_rides.cumsum() / sorted_total_rides.sum()
top_20_total_stations = sorted_total_rides[cumulative_percentage_total <= 0.20]
# Separating the counts for start and end stations for the top 20%
top_20_start_counts = df['start_station_name'].value_counts().reindex(
    top_20_total_stations.index, fill_value=0)
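The preview truncates here; the symmetric end-station counterpart would plausibly be (the variable name top_20_end_counts is an assumption):
# End-station counts for the same top-20% stations (illustrative)
top_20_end_counts = df['end_station_name'].value_counts().reindex(
    top_20_total_stations.index, fill_value=0)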
smzn / gist:1ade1b6bb8530fbbf4ac1db8600ba395
Created January 4, 2024 02:46
Adjusting to select stations that make up the top 20% of all stations by count
# Adjusting to select stations that make up the top 20% of all stations by count
# Sorting the station counts
sorted_start_stations = df['start_station_name'].value_counts().sort_values(ascending=False)
sorted_end_stations = df['end_station_name'].value_counts().sort_values(ascending=False)
# Calculating the cumulative percentage
cumulative_percentage_start = sorted_start_stations.cumsum() / sorted_start_stations.sum()
cumulative_percentage_end = sorted_end_stations.cumsum() / sorted_end_stations.sum()
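The preview ends before the cutoff is applied; a minimal sketch of the selection step, reusing the 0.20 threshold from the gist above (variable names are assumptions):
# Keep the stations that make up the top 20% of rides on each side
top_20_start_stations = sorted_start_stations[cumulative_percentage_start <= 0.20]
top_20_end_stations = sorted_end_stations[cumulative_percentage_end <= 0.20]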
smzn / gist:d40ae7adbad91fe42395d22a6b9fd80d
Created January 4, 2024 02:06
histograms for each categorical column
import matplotlib.pyplot as plt
# Selecting categorical columns
categorical_columns = df[['rideable_type', 'member_casual']]
# Creating frequency bar charts for each categorical column
plt.figure(figsize=(12, 6))
# Rideable Type
plt.subplot(1, 2, 1)
df['rideable_type'].value_counts().plot(kind='bar', color='skyblue')
plt.title('Rideable Type Frequency')
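The preview stops after the first panel; the second presumably mirrors it for member_casual (a sketch; the color and the tight_layout/show calls are assumptions):
# Member vs. Casual
plt.subplot(1, 2, 2)
df['member_casual'].value_counts().plot(kind='bar', color='salmon')  # color is illustrative
plt.title('Member vs. Casual Frequency')
plt.tight_layout()
plt.show()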
smzn / gist:098114fab724eda7a77186c596d0cd0d
Created January 4, 2024 01:56
histograms for each of the latitude and longitude columns
import matplotlib.pyplot as plt
# Creating separate histograms for each of the latitude and longitude columns
df = divvy_tripdata
plt.figure(figsize=(12, 8))
# Start Latitude
plt.subplot(2, 2, 1)
plt.hist(df['start_lat'], bins=30, color='skyblue', edgecolor='black')
plt.axvline(df['start_lat'].mean(), color='red', linestyle='dashed', linewidth=1)
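Only the first of the four panels survives the preview; the remaining ones presumably repeat the pattern for the other coordinate columns (a sketch assuming the standard Divvy column names start_lng, end_lat, and end_lng; dropna() guards against the missing end coordinates these files typically contain):
plt.title('Start Latitude')
# Remaining panels (column names assumed)
for i, col in enumerate(['start_lng', 'end_lat', 'end_lng'], start=2):
    plt.subplot(2, 2, i)
    plt.hist(df[col].dropna(), bins=30, color='skyblue', edgecolor='black')
    plt.axvline(df[col].mean(), color='red', linestyle='dashed', linewidth=1)
    plt.title(col)
plt.tight_layout()
plt.show()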
smzn / gist:9988aff6f2b5cef4b8d2c2de2b8e47da
Created January 4, 2024 01:38
202311-divvy-tripdata.csv
import pandas as pd
# Load the new CSV file
file_path_tripdata = '202311-divvy-tripdata.csv'
divvy_tripdata = pd.read_csv(file_path_tripdata)
# Display the first few rows of the dataframe to understand its structure and content
divvy_tripdata.head()
from sklearn.cluster import KMeans
# Clustering (combined_top_items_daily, principal_components_combined, and n_components_combined
# are presumably defined in the bakery PCA gists further down this page)
# Number of clusters
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
clusters = kmeans.fit_predict(combined_top_items_daily)
# Creating a DataFrame for the PCA components
pca_df = pd.DataFrame(principal_components_combined, columns=[f'PC{i+1}' for i in range(n_components_combined)])
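The preview ends here; a natural continuation is to attach the cluster labels and plot the first two components (a sketch; it assumes matplotlib.pyplot is imported as plt, as in the gists above):
# Attach cluster labels and visualize the first two principal components
pca_df['Cluster'] = clusters
plt.scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster'], cmap='viridis')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('KMeans clusters in PCA space')
plt.show()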
smzn / gist:1cfcad7a4122eb02af08ee3cabe46c38
Created December 27, 2023 01:49
Display the elements of each principal component as a histogram
import matplotlib.pyplot as plt
# Exclude the Total_Daily_Sales, Explained Variance, and Cumulative Explained Variance columns
data = pca_components_combined_df.drop(columns=['Total_Daily_Sales', 'Explained Variance', 'Cumulative Explained Variance'])
# Plot the elements of each row (each principal component) as a bar chart
for index, row in data.iterrows():
    plt.figure(figsize=(8, 4))  # Set the figure size
    plt.bar(row.index, row.values)  # Draw the bar chart
    plt.xticks(rotation=90)  # Rotate the x-axis labels vertically (after plotting, so it applies to the category ticks)
    plt.xlabel('Columns')  # x-axis label
    plt.ylabel('Values')  # y-axis label
    plt.show()  # Render one figure per principal component
smzn / gist:a61154820c85a1957154ba63a837d378
Last active December 27, 2023 01:23
Applying PCA to the combined data
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
# Filtering out the top items that make up 80% of the sales
top_items = item_sales_counts[item_sales_counts.cumsum() <= eighty_percent_threshold].index
print('Filtering out the top items that make up 80% of the sales : {}'.format(list(top_items)))
# Selecting sales data for these top items
filtered_top_items_data = bakery_data[bakery_data['Items'].isin(top_items)]
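The preview cuts off before the PCA itself. Judging from the variable names used in the clustering snippet above (combined_top_items_daily, principal_components_combined, n_components_combined), the remaining steps plausibly look like this; the pivot and the choice of n_components are assumptions:
# Pivot to a Date x Items matrix of daily sales counts (same pattern as the gist below)
combined_top_items_daily = filtered_top_items_data.groupby(['Date', 'Items']).size().unstack(fill_value=0)
# Standardize, then apply PCA
scaled = StandardScaler().fit_transform(combined_top_items_daily)
n_components_combined = 5  # illustrative choice
pca = PCA(n_components=n_components_combined)
principal_components_combined = pca.fit_transform(scaled)
print(pca.explained_variance_ratio_)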
smzn / gist:f1a41feae17a7b3d720b9442c7dbf430
Created December 26, 2023 06:59
Counting the number of each top item sold each day
# Identifying the items that make up 80% of total sales
# (item_sales_counts is assumed sorted in descending order, as value_counts returns, so its cumulative sum is monotone)
top_items = item_sales_counts[item_sales_counts.cumsum() <= eighty_percent_threshold].index
# Filtering data for only the top items
filtered_data = bakery_data[bakery_data['Items'].isin(top_items)]
# Counting the number of each top item sold each day
daily_top_item_sales = filtered_data.groupby(['Date', 'Items']).size().unstack(fill_value=0)
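A quick, illustrative way to sanity-check the resulting Date x Items matrix:
# Each row is a day, each column a top item, values are units sold that day
print(daily_top_item_sales.shape)
print(daily_top_item_sales.head())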