smzn/gist:a61154820c85a1957154ba63a837d378

## gistfile1.txt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

# Keep the items whose running total of sales stays within the 80% threshold.
# NOTE(review): this presumably relies on item_sales_counts being sorted in
# descending order upstream — confirm where it is built.
top_items = item_sales_counts[item_sales_counts.cumsum() <= eighty_percent_threshold].index
print(f'Filtering out the top items that make up 80% of the sales : {list(top_items)}')

# Restrict the transactions to those items and pivot to one row per date and
# one column per item, counting rows (missing date/item pairs become 0).
filtered_top_items_data = bakery_data[bakery_data['Items'].isin(top_items)]
daily_top_items_sales = filtered_top_items_data.groupby(['Date', 'Items']).size().unstack(fill_value=0)

# Append the overall daily count as an extra 'Total_Daily_Sales' column.
daily_top_items_sales_with_total = daily_top_items_sales.join(daily_item_count.rename('Total_Daily_Sales'))

# Standardize every column; values left missing by the join are treated as 0.
scaler = StandardScaler()
standardized_combined_data = scaler.fit_transform(daily_top_items_sales_with_total.fillna(0))

# Fit PCA once. A fractional n_components asks scikit-learn to keep as many
# components as are needed to explain that share of the variance.
# The original code fitted a second, identical PCA on the same data later on;
# that re-fit reproduced the same model, so it has been dropped and every
# downstream value now visibly comes from this single fit.
pca_combined = PCA(n_components=0.80)
principal_components_combined = pca_combined.fit_transform(standardized_combined_data)

# Number of components PCA chose
n_components_combined = pca_combined.n_components_

# Loadings table: one row per principal component, one column per input column.
pca_components_combined_df = pd.DataFrame(pca_combined.components_, columns=daily_top_items_sales_with_total.columns)

# Per-component and cumulative explained-variance ratios.
explained_variance_combined = pca_combined.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_combined)

# Attach both ratios to the loadings table as extra columns (one value per component row).
pca_components_combined_df['Explained Variance'] = explained_variance_combined
pca_components_combined_df['Cumulative Explained Variance'] = cumulative_explained_variance

# Plotting the biplot for the combined data
def biplot(score, coeff, labels=None, *, ax=None):
    """Draw a PCA biplot: scaled scores as points, loadings as arrows.

    Args:
        score: 2-D array indexed as ``score[:, 0]`` / ``score[:, 1]``; the
            first two columns are plotted, so it needs at least two columns.
        coeff: 2-D array with one row per variable; columns 0 and 1 give the
            arrow end point for that variable.
        labels: Optional sequence indexed by row position of ``coeff``; when
            given, each arrow is annotated with ``labels[i]``.
        ax: Optional drawing target exposing ``scatter``, ``arrow`` and
            ``text`` (e.g. a matplotlib Axes). When omitted, the calls go
            through the module-level ``plt`` exactly as before.
    """
    target = plt if ax is None else ax

    xs = score[:, 0]
    ys = score[:, 1]

    # Scale each axis by its range. A zero range (all scores identical on that
    # axis) would divide by zero and push inf/NaN into the plot, so fall back
    # to leaving that axis unscaled.
    x_span = xs.max() - xs.min()
    y_span = ys.max() - ys.min()
    scalex = 1.0 / x_span if x_span else 1.0
    scaley = 1.0 / y_span if y_span else 1.0
    target.scatter(xs * scalex, ys * scaley)

    for i in range(coeff.shape[0]):
        target.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
        if labels is not None:
            # Place the label slightly beyond the arrow tip.
            target.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, labels[i], color='g', ha='center', va='center')

# Render the biplot on a new 12x8 figure: scores against the loadings of the
# first two principal components, one labelled arrow per input column.
plt.figure(figsize=(12, 8))
biplot(
    principal_components_combined,
    pca_combined.components_[:2].T,
    labels=daily_top_items_sales_with_total.columns,
)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Biplot of PCA (Including Total Daily Sales)')
plt.show()

# Bare expression: displays the loadings table when run as a notebook cell.
pca_components_combined_df
	from sklearn.decomposition import PCA
	from sklearn.preprocessing import StandardScaler
	import numpy as np

	# Keep the items whose running total of sales stays within the 80% threshold.
	# NOTE(review): this presumably relies on item_sales_counts being sorted in
	# descending order upstream — confirm where it is built.
	top_items = item_sales_counts[item_sales_counts.cumsum() <= eighty_percent_threshold].index
	print(f'Filtering out the top items that make up 80% of the sales : {list(top_items)}')

	# Restrict the transactions to those items and pivot to one row per date and
	# one column per item, counting rows (missing date/item pairs become 0).
	filtered_top_items_data = bakery_data[bakery_data['Items'].isin(top_items)]
	daily_top_items_sales = filtered_top_items_data.groupby(['Date', 'Items']).size().unstack(fill_value=0)

	# Append the overall daily count as an extra 'Total_Daily_Sales' column.
	daily_top_items_sales_with_total = daily_top_items_sales.join(daily_item_count.rename('Total_Daily_Sales'))

	# Standardize every column; values left missing by the join are treated as 0.
	scaler = StandardScaler()
	standardized_combined_data = scaler.fit_transform(daily_top_items_sales_with_total.fillna(0))

	# Fit PCA once. A fractional n_components asks scikit-learn to keep as many
	# components as are needed to explain that share of the variance.
	# The original code fitted a second, identical PCA on the same data later on;
	# that re-fit reproduced the same model, so it has been dropped and every
	# downstream value now visibly comes from this single fit.
	pca_combined = PCA(n_components=0.80)
	principal_components_combined = pca_combined.fit_transform(standardized_combined_data)

	# Number of components PCA chose
	n_components_combined = pca_combined.n_components_

	# Loadings table: one row per principal component, one column per input column.
	pca_components_combined_df = pd.DataFrame(pca_combined.components_, columns=daily_top_items_sales_with_total.columns)

	# Per-component and cumulative explained-variance ratios.
	explained_variance_combined = pca_combined.explained_variance_ratio_
	cumulative_explained_variance = np.cumsum(explained_variance_combined)

	# Attach both ratios to the loadings table as extra columns (one value per component row).
	pca_components_combined_df['Explained Variance'] = explained_variance_combined
	pca_components_combined_df['Cumulative Explained Variance'] = cumulative_explained_variance

	# Plotting the biplot for the combined data
	def biplot(score, coeff, labels=None, *, ax=None):
	    """Draw a PCA biplot: scaled scores as points, loadings as arrows.

	    Args:
	        score: 2-D array indexed as ``score[:, 0]`` / ``score[:, 1]``; the
	            first two columns are plotted, so it needs at least two columns.
	        coeff: 2-D array with one row per variable; columns 0 and 1 give the
	            arrow end point for that variable.
	        labels: Optional sequence indexed by row position of ``coeff``; when
	            given, each arrow is annotated with ``labels[i]``.
	        ax: Optional drawing target exposing ``scatter``, ``arrow`` and
	            ``text`` (e.g. a matplotlib Axes). When omitted, the calls go
	            through the module-level ``plt`` exactly as before.
	    """
	    target = plt if ax is None else ax

	    xs = score[:, 0]
	    ys = score[:, 1]

	    # Scale each axis by its range. A zero range (all scores identical on that
	    # axis) would divide by zero and push inf/NaN into the plot, so fall back
	    # to leaving that axis unscaled.
	    x_span = xs.max() - xs.min()
	    y_span = ys.max() - ys.min()
	    scalex = 1.0 / x_span if x_span else 1.0
	    scaley = 1.0 / y_span if y_span else 1.0
	    target.scatter(xs * scalex, ys * scaley)

	    for i in range(coeff.shape[0]):
	        target.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
	        if labels is not None:
	            # Place the label slightly beyond the arrow tip.
	            target.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15, labels[i], color='g', ha='center', va='center')

	# Render the biplot on a new 12x8 figure: scores against the loadings of the
	# first two principal components, one labelled arrow per input column.
	plt.figure(figsize=(12, 8))
	biplot(
	    principal_components_combined,
	    pca_combined.components_[:2].T,
	    labels=daily_top_items_sales_with_total.columns,
	)
	plt.xlabel('PC1')
	plt.ylabel('PC2')
	plt.title('Biplot of PCA (Including Total Daily Sales)')
	plt.show()

	# Bare expression: displays the loadings table when run as a notebook cell.
	pca_components_combined_df