Last active
December 27, 2023 01:23
-
-
Save smzn/a61154820c85a1957154ba63a837d378 to your computer and use it in GitHub Desktop.
Applying PCA to the combined data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

# Keep the items whose running (cumulative) sales count is still within the
# threshold.
# NOTE(review): this presumes item_sales_counts is sorted in descending order
# and eighty_percent_threshold is 80% of the total; both are defined outside
# this snippet -- confirm.
top_items = item_sales_counts[item_sales_counts.cumsum() <= eighty_percent_threshold].index
print('Filtering out the top items that make up 80% of the sales : {}'.format(list(top_items)))

# Restrict the transactions to the top items, then count rows per
# (Date, Items) pair and pivot the items into columns (0 where an item has no
# rows on a given date).
filtered_top_items_data = bakery_data[bakery_data['Items'].isin(top_items)]
daily_top_items_sales = filtered_top_items_data.groupby(['Date', 'Items']).size().unstack(fill_value=0)

# Append the overall daily count as an extra 'Total_Daily_Sales' column.
daily_top_items_sales_with_total = daily_top_items_sales.join(daily_item_count.rename('Total_Daily_Sales'))

# Standardize every column; fillna(0) covers any values left missing by the
# join above.
scaler = StandardScaler()
standardized_combined_data = scaler.fit_transform(daily_top_items_sales_with_total.fillna(0))

# Fit PCA once. A fractional n_components asks scikit-learn to keep as many
# components as are needed to explain at least that share of the variance.
# (The original script re-fitted an identical model a second time further
# down; that duplicate fit produced the same result and has been removed.)
pca_combined = PCA(n_components=0.80)
principal_components_combined = pca_combined.fit_transform(standardized_combined_data)

# Number of components PCA chose.
n_components_combined = pca_combined.n_components_

# One row per retained component, one column per input feature (the top items
# plus 'Total_Daily_Sales').
pca_components_combined_df = pd.DataFrame(
    pca_combined.components_,
    columns=daily_top_items_sales_with_total.columns,
)

# Per-component explained variance ratio and its running total.
explained_variance_combined = pca_combined.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_combined)

# Attach both as extra columns next to the component loadings.
pca_components_combined_df['Explained Variance'] = explained_variance_combined
pca_components_combined_df['Cumulative Explained Variance'] = cumulative_explained_variance
# Plotting the biplot for the combined data
def biplot(score, coeff, labels=None):
    """Draw a PCA biplot on the current matplotlib figure.

    The first two columns of ``score`` are plotted as a scatter, each axis
    divided by its own range (max - min). Every row of ``coeff`` is drawn as
    a red arrow from the origin to ``(coeff[i, 0], coeff[i, 1])``.

    Args:
        score: 2-D array with at least two columns; columns 0 and 1 are used.
        coeff: 2-D array with at least two columns and one row per arrow.
        labels: Optional sequence indexed like the rows of ``coeff``. When
            given, ``labels[i]`` is written in green slightly beyond the tip
            of arrow ``i``.
    """
    def _unit_scale(values):
        # 1 / range of the values. A zero range (all values equal) would
        # divide by zero and turn the scatter coordinates into inf/nan, so
        # fall back to leaving that axis unscaled.
        span = values.max() - values.min()
        return 1.0 / span if span else 1.0

    xs = score[:, 0]
    ys = score[:, 1]
    plt.scatter(xs * _unit_scale(xs), ys * _unit_scale(ys))

    for i, row in enumerate(coeff):
        dx, dy = row[0], row[1]
        plt.arrow(0, 0, dx, dy, color='r', alpha=0.5)
        if labels is not None:
            # Push the label 15% past the arrow tip so it does not overlap it.
            plt.text(dx * 1.15, dy * 1.15, labels[i], color='g', ha='center', va='center')
# Draw the biplot of the first two principal components, one arrow per input
# column. PCA(n_components=0.80) may keep a single component; biplot() indexes
# column 1 of the scores and would then fail with an IndexError, so the plot
# is skipped in that case instead of aborting before the table below.
if principal_components_combined.shape[1] >= 2:
    plt.figure(figsize=(12, 8))
    biplot(
        principal_components_combined,
        np.transpose(pca_combined.components_[0:2, :]),
        labels=daily_top_items_sales_with_total.columns,
    )
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('Biplot of PCA (Including Total Daily Sales)')
    plt.show()
else:
    print('Biplot skipped: PCA retained fewer than two components.')

# Bare expression -- presumably the last line of a notebook cell, so that the
# component table is rendered as the cell output.
pca_components_combined_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment