# makispl/pca4clustering.py

## pca4clustering.py
# Read in the pre-processed training plays; GAME_ID is converted with `str`
# so it stays a string rather than being parsed as a number.
plays_df = pd.read_csv(
    '../data/interim/plays_17_18_19_pre_proc_train.csv',
    converters={'GAME_ID': str},
)

# Work on a copy of the features selected for normalisation so the source
# frame is left untouched.
# NOTE(review): `data` is not defined in this snippet - presumably it is the
# feature subset of `plays_df` prepared elsewhere; confirm before running.
data_stnd = data.copy()

# Instantiate the scaler, then fit it and rescale the features in one step.
scaler = MinMaxScaler()
data_stnd = scaler.fit_transform(data_stnd)

# Fit a PCA without limiting the number of components, to inspect how much
# variance each component explains.
pca = PCA()
pca.fit(data_stnd)
# Bare expression kept from the original: it only shows a value when it is
# the last statement of an interactive/notebook cell.
pca.explained_variance_ratio_

# Visualize the cumulative explained variance to choose the number of
# principal components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
with plt.style.context('fivethirtyeight'):
    # One figure only: the original opened two, leaving the first one empty.
    fig = plt.figure(figsize=(10, 8))
    # The x-axis length follows the number of fitted components instead of a
    # hard-coded 18, so the plot also works for other feature counts.
    plt.plot(
        range(1, len(cumulative_variance) + 1),
        cumulative_variance,
        markerfacecolor='red',
        marker='o',
        linestyle='--',
    )
    plt.xlabel('Principal Components', fontsize=18)
    plt.ylabel('Explained Variance %', fontsize=18)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

# Re-instantiate PCA with 4 components (per the original author's note these
# explain > 80% of the variance) and project the scaled data onto them.
pca = PCA(n_components=4)
pca.fit(data_stnd)
pca_scores = pca.transform(data_stnd)