# jsun/machine_learning_pca.py

## machine_learning_pca.py
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Load the MNIST digits data set. The original script used `mnist` without
# ever assigning it (the imported `fetch_mldata` was never called), so the
# split below failed with a NameError.
# NOTE: `fetch_mldata` was removed in scikit-learn 0.22; on newer releases the
# replacement is `sklearn.datasets.fetch_openml` (data set "mnist_784").
mnist = fetch_mldata("MNIST original")

# get training and test sets: hold out 20% for testing, fixed seed so the
# split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    mnist.data, mnist.target, test_size=0.2, random_state=0
)
print(x_train.shape)
print(x_test.shape)

# standardization: calculate the scaling parameters from the training data set
# only and scale the training data set with them
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

# PCA fitted on the training data set; a float in (0, 1) keeps the smallest
# number of principal components that together explain at least 80% of the
# variance (not 80% of the components)
pca = PCA(0.80)
pca.fit(x_train)
print(pca.n_components_)
x_train = pca.transform(x_train)

# use logistic regression as the classifier on the reduced features
clf = LogisticRegression()
clf.fit(x_train, y_train)

# prediction: the test set goes through the scaler and PCA fitted on the
# training data, so no information from the test set leaks into the model
x_test = scaler.transform(x_test)
x_test = pca.transform(x_test)
y_pred = clf.predict(x_test)

# print the matrix so the result is visible when run as a script, not only
# when pasted into an interactive session
print(confusion_matrix(y_test, y_pred))
## array([[1277,    0,    2,    2,    4,    7,   13,    0,    7,    0],
##        [   0, 1555,   12,    3,    1,    7,    1,    3,   19,    3],
##        [  10,   12, 1211,   21,   19,    5,   14,   18,   33,    5],
##        [   6,   15,   37, 1260,    0,   46,    4,   11,   32,   16],
##        [   5,    9,   10,    4, 1252,    2,   13,    4,    9,   54],
##        [  14,    7,   14,   50,   20, 1086,   25,    7,   41,   16],
##        [   6,    3,    9,    0,    4,   12, 1357,    1,    3,    2],
##        [   9,    5,   18,    7,   18,    3,    0, 1351,    4,   46],
##        [   9,   33,   15,   46,   10,   41,   14,    4, 1196,   22],
##        [  11,    7,   10,   29,   58,   10,    1,   50,    7, 1236]])
	from sklearn.datasets import fetch_mldata
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler
	from sklearn.decomposition import PCA
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import confusion_matrix

	# Load the MNIST digits data set. The original script used `mnist` without
	# ever assigning it (the imported `fetch_mldata` was never called), so the
	# split below failed with a NameError.
	# NOTE: `fetch_mldata` was removed in scikit-learn 0.22; on newer releases the
	# replacement is `sklearn.datasets.fetch_openml` (data set "mnist_784").
	mnist = fetch_mldata("MNIST original")

	# get training and test sets: hold out 20% for testing, fixed seed so the
	# split is reproducible
	x_train, x_test, y_train, y_test = train_test_split(
		mnist.data, mnist.target, test_size=0.2, random_state=0
	)
	print(x_train.shape)
	print(x_test.shape)

	# standardization: calculate the scaling parameters from the training data set
	# only and scale the training data set with them
	scaler = StandardScaler()
	x_train = scaler.fit_transform(x_train)

	# PCA fitted on the training data set; a float in (0, 1) keeps the smallest
	# number of principal components that together explain at least 80% of the
	# variance (not 80% of the components)
	pca = PCA(0.80)
	pca.fit(x_train)
	print(pca.n_components_)
	x_train = pca.transform(x_train)

	# use logistic regression as the classifier on the reduced features
	clf = LogisticRegression()
	clf.fit(x_train, y_train)

	# prediction: the test set goes through the scaler and PCA fitted on the
	# training data, so no information from the test set leaks into the model
	x_test = scaler.transform(x_test)
	x_test = pca.transform(x_test)
	y_pred = clf.predict(x_test)

	# print the matrix so the result is visible when run as a script, not only
	# when pasted into an interactive session
	print(confusion_matrix(y_test, y_pred))
	## array([[1277, 0, 2, 2, 4, 7, 13, 0, 7, 0],
	## [ 0, 1555, 12, 3, 1, 7, 1, 3, 19, 3],
	## [ 10, 12, 1211, 21, 19, 5, 14, 18, 33, 5],
	## [ 6, 15, 37, 1260, 0, 46, 4, 11, 32, 16],
	## [ 5, 9, 10, 4, 1252, 2, 13, 4, 9, 54],
	## [ 14, 7, 14, 50, 20, 1086, 25, 7, 41, 16],
	## [ 6, 3, 9, 0, 4, 12, 1357, 1, 3, 2],
	## [ 9, 5, 18, 7, 18, 3, 0, 1351, 4, 46],
	## [ 9, 33, 15, 46, 10, 41, 14, 4, 1196, 22],
	## [ 11, 7, 10, 29, 58, 10, 1, 50, 7, 1236]])