Skip to content

Instantly share code, notes, and snippets.

@singhay
Created March 27, 2016 02:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save singhay/a252cfadc7b8ab118923 to your computer and use it in GitHub Desktop.
Save singhay/a252cfadc7b8ab118923 to your computer and use it in GitHub Desktop.
Cute fish out of MNIST Handwritten Dataset [OC]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
'''
https://www.reddit.com/r/dataisbeautiful/comments/4c3zjt/cute_fish_out_of_mnist_handwritten_dataset_oc/
PCA Dimensionality Reduction of Handwritten Dataset from 784 to 2, normalizing and visualizing.
'''
def main(csv_path='a3_datasets/datasets/digits/train.csv'):
    """Plot the digits dataset projected onto its first two PCA components.

    Loads the CSV at ``csv_path``, scales every column after the first by
    1/255, reduces the result to two whitened principal components, and
    shows a scatter plot in which each point is annotated with its digit
    label inside a box colored per label.

    Args:
        csv_path: Location of the training CSV. It must contain a
            ``label`` column; every column after the first is used as a
            feature. Defaults to the path the script originally
            hard-coded, so ``main()`` behaves as before.
    """
    normalization_constant = 255

    # Loading Train Dataset
    data_frame = pd.read_csv(csv_path)
    # Column 0 is skipped; the remaining columns are scaled by 1/255.
    features = np.array(data_frame.iloc[:, 1:] / normalization_constant)
    labels = np.array(data_frame['label'])

    pca = PCA(2, whiten=True)
    projected = pca.fit_transform(features)
    xs = projected[:, 0]
    ys = projected[:, 1]

    label_color = {1: 'r', 2: 'b', 3: 'g', 4: 'c', 5: 'm', 6: 'y',
                   7: '0.75', 8: 'w', 9: '#87fc70', 0: '#ffc0cb'}

    # These style dicts do not depend on the individual point, so build
    # them once instead of once per annotated sample.
    arrow_style = dict(arrowstyle='->', connectionstyle='arc3,rad=0')
    box_styles = {
        digit: dict(boxstyle='round, pad=0.5', fc=color, alpha=0.5)
        for digit, color in label_color.items()
    }

    plt.scatter(xs, ys)
    for label, x, y in zip(labels, xs, ys):
        # A (0, 0) offset places the label box directly on the point.
        plt.annotate(str(label),
                     xy=(x, y), xytext=(0, 0),
                     textcoords='offset points', ha='right', va='bottom',
                     bbox=box_styles[int(label)],
                     arrowprops=arrow_style)

    # Graph
    plt.title('MNIST Dataset reduced to 2 Components using PCA')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.show()
# Run the visualization only when executed as a script, not on import.
if __name__ == "__main__":
    main()
''' OUTPUT
filename: 2000.png
Digits 0 and 1 are easily distinguishable; on the other hand,
the only thing stopping me from calling 2 and 3 distinguishable as well
is the stark contrast in color I've set.
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment