Skip to content

Instantly share code, notes, and snippets.

@smzn
Last active December 27, 2023 01:23
Show Gist options
  • Save smzn/a61154820c85a1957154ba63a837d378 to your computer and use it in GitHub Desktop.
Save smzn/a61154820c85a1957154ba63a837d378 to your computer and use it in GitHub Desktop.
Applying PCA to the combined data
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np
# --- Select the best-selling items -------------------------------------------
# Keep the items whose running total of sales stays at or below the threshold.
# NOTE(review): `item_sales_counts` and `eighty_percent_threshold` are defined
# outside this snippet; this presumably expects the counts sorted in descending
# order and the threshold set to 80% of their sum -- confirm against the caller.
top_items = item_sales_counts[item_sales_counts.cumsum() <= eighty_percent_threshold].index
print(f'Filtering out the top items that make up 80% of the sales : {list(top_items)}')

# --- Build the daily feature matrix ------------------------------------------
# Count the rows recorded per (Date, Items) pair and pivot the items into
# columns; (date, item) combinations with no rows become 0.
filtered_top_items_data = bakery_data[bakery_data['Items'].isin(top_items)]
daily_top_items_sales = (
    filtered_top_items_data.groupby(['Date', 'Items']).size().unstack(fill_value=0)
)

# Append the overall daily count as one more feature column. The join aligns on
# the index, so `daily_item_count` is assumed to be indexed by date as well
# (defined outside this snippet -- TODO confirm).
daily_top_items_sales_with_total = daily_top_items_sales.join(
    daily_item_count.rename('Total_Daily_Sales')
)

# --- Standardize and run PCA -------------------------------------------------
# Missing values (e.g. dates absent from `daily_item_count`) are treated as 0
# before standardizing.
scaler = StandardScaler()
standardized_combined_data = scaler.fit_transform(daily_top_items_sales_with_total.fillna(0))

# A float `n_components` asks PCA to keep as many components as needed to
# explain that fraction of the variance. The model is fitted exactly once: the
# fit is deterministic for these settings, so fitting it a second time on the
# same data would only repeat the work.
pca_combined = PCA(n_components=0.80)
principal_components_combined = pca_combined.fit_transform(standardized_combined_data)

# Number of components PCA chose
n_components_combined = pca_combined.n_components_

# Per-component and cumulative share of explained variance
explained_variance_combined = pca_combined.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_combined)

# --- Summarize the loadings --------------------------------------------------
# One row per principal component, one column per input feature (including the
# total daily sales), followed by the two explained-variance columns.
# NOTE(review): `pd` is not imported in this snippet; presumably pandas is
# imported earlier in the notebook.
pca_components_combined_df = pd.DataFrame(
    pca_combined.components_,
    columns=daily_top_items_sales_with_total.columns,
)
pca_components_combined_df['Explained Variance'] = explained_variance_combined
pca_components_combined_df['Cumulative Explained Variance'] = cumulative_explained_variance
# Plotting the biplot for the combined data
def biplot(score, coeff, labels=None):
    """Draw a PCA biplot with pyplot: scaled sample scores plus loading arrows.

    The first two columns of ``score`` are plotted as a scatter, each axis
    divided by its own range (max - min). Every row of ``coeff`` is drawn as a
    red arrow from the origin to ``(coeff[i, 0], coeff[i, 1])``; when
    ``labels`` is given, ``labels[i]`` is written slightly beyond the arrow
    tip.

    Args:
        score: 2-D array of projected samples; columns 0 and 1 are used.
        coeff: 2-D array with one row per variable; columns 0 and 1 are the
            arrow components.
        labels: Optional indexable of per-variable labels, aligned with the
            rows of ``coeff``.

    Raises:
        IndexError: If ``score`` or ``coeff`` has fewer than two columns.

    Note:
        Relies on ``plt`` (matplotlib.pyplot) being available in the
        enclosing namespace.
    """
    xs = score[:, 0]
    ys = score[:, 1]

    # Normalize each axis by its range. A component that is constant across
    # all samples has a zero range; fall back to a scale of 1.0 there instead
    # of dividing by zero and plotting inf/NaN coordinates.
    x_range = xs.max() - xs.min()
    y_range = ys.max() - ys.min()
    scalex = 1.0 / x_range if x_range else 1.0
    scaley = 1.0 / y_range if y_range else 1.0

    plt.scatter(xs * scalex, ys * scaley)

    n = coeff.shape[0]
    for i in range(n):
        plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
        if labels is not None:
            # Place the label 15% past the arrow tip so it does not overlap it.
            plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, labels[i], color='g', ha='center', va='center')
# Loadings of the first two principal components, one row per feature, and the
# matching feature names used to label the arrows.
loadings_first_two = pca_combined.components_[0:2, :].T
feature_labels = daily_top_items_sales_with_total.columns

# Render the biplot on a fresh 12x8 figure.
plt.figure(figsize=(12, 8))
biplot(
    principal_components_combined,
    loadings_first_two,
    labels=feature_labels,
)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Biplot of PCA (Including Total Daily Sales)')
plt.show()

# Trailing bare expression: shows the loadings table as the notebook cell output.
pca_components_combined_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment