Andreas Chandra andreaschandra

## import.py
import numpy as np
from sklearn import preprocessing

## biner.py
# Sample 3x3 feature matrix used by the preprocessing snippets.
# NOTE: the name `input` shadows the Python builtin; it is kept because the
# later snippets refer to the array by this name.
input = np.array([
    [8, 2, 3],
    [4, 7, 1],
    [9, 2, 6],
])

# Binarize with a cut-off of 5: entries above the threshold become 1 and all
# other entries become 0.
binarizer = preprocessing.Binarizer(threshold=5)
data_biner = binarizer.transform(input)

output:
[[1 0 0]
 [0 1 0]
 [1 0 1]]

## meanremoval.py
# Before mean removal: mean over all entries and per-column standard deviation.
overall_mean = input.mean()
column_std = input.std(axis=0)
print("Mean = ", overall_mean, "\n", "Std deviation = ", column_std)

output:
Mean =  4.66666666667
Std deviation =  [ 2.1602469   2.3570226   2.05480467]

# After mean removal: standardize the data, then report the per-column mean
# and standard deviation of the result so the effect of the scaling is visible
# (the mean should be ~0 and the standard deviation 1).
data_scaled = preprocessing.scale(input)
print("Mean = ", data_scaled.mean(axis=0), "\n",
      "Std deviation = ", data_scaled.std(axis=0))

## scaling.py
# Min-max scaling: linearly rescale each column into the range [0, 1].
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaler_minmax.fit(input)
data_scaled_minmax = data_scaler_minmax.transform(input)
# Bare expression: echoes the array when run in an interactive session.
data_scaled_minmax

output:
array([[ 0.8,  0. ,  0.4],
       [ 0. ,  1. ,  0. ],
       [ 1. ,  0. ,  1. ]])

## normalization.py
# Normalize every row to unit L1 norm (absolute values sum to 1) and to
# unit L2 norm (Euclidean length 1).
data_normalized_l1 = preprocessing.normalize(input, norm='l1')
data_normalized_l2 = preprocessing.normalize(input, norm='l2')

for heading, normalized in (
    ("\nL1 normalized data:\n", data_normalized_l1),
    ("\nL2 normalized data:\n", data_normalized_l2),
):
    print(heading, normalized)

L1 normalized data:
 [[ 0.61538462  0.15384615  0.23076923]
 [ 0.33333333  0.58333333  0.08333333]
 [ 0.52941176  0.11764706  0.35294118]]

## label_encoding.py
# Day-of-week labels (in Indonesian) to be encoded as integers.
label_kategori = ['senin', 'selasa', 'rabu', 'kamis', 'jumat', 'sabtu', 'minggu']

# fit() returns the encoder itself, so construction and fitting are chained.
encoder = preprocessing.LabelEncoder().fit(label_kategori)

# Show the integer index assigned to each learned class.
print("\nLabel mapping:")
for index, label in enumerate(encoder.classes_):
    print(label, '>', index)

output:
Label mapping:
jumat > 0
kamis > 1
minggu > 2
rabu > 3
sabtu > 4
selasa > 5
senin > 6

## gist:268c632cf81d3c6592b566c01ffd4654
# Replace the placeholder with the name of your own bucket before running.
# (Presumably an Amazon S3 bucket, given the boto3/sagemaker imports - confirm.)
# The placeholder is a string literal so that the snippet is valid Python.
bucket = '<your-bucket>'
# Prefix string for this example (presumably the key prefix inside the bucket).
prefix = 'topic-kmeans'

import warnings
warnings.simplefilter("ignore")

import os
import boto3
import sagemaker
import numpy as np

## gist:ce8acf6f5d91b77c008a40a3b3c6e916
# Category names to load from the training set (a subset of what is available).
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# To run the analysis on every category instead, use the following:
# categories = None

## Feature Extraction
print("Extracting features from the training dataset using a sparse vectorizer")
# Start of the timed section (assumes `time` is the function from the `time`
# module, imported elsewhere - confirm).
t0 = time()

# TF-IDF configuration: document-frequency bounds (max_df, min_df), English
# stop-word removal and a cap of 10000 features.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    max_features=10000,
    min_df=2,
    stop_words='english',
    use_idf=True,
)

# Learn the vocabulary and build the feature matrix in a single pass.
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))

## gist:f7140f27a7dfd0a34b9d6b232618292b
# Toggle between the mini-batch and the full-batch clustering estimator.
minibatch = False
# Number of clusters to fit.
true_k = 4

km = (
    MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                    init_size=1000, batch_size=1000, verbose=False)
    if minibatch
    else KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=False)
)
	# Sample 3x3 feature matrix used by the preprocessing snippets.
	# NOTE: the name `input` shadows the Python builtin; it is kept because the
	# later snippets refer to the array by this name.
	input = np.array([
	    [8, 2, 3],
	    [4, 7, 1],
	    [9, 2, 6],
	])

	# Binarize with a cut-off of 5: entries above the threshold become 1 and all
	# other entries become 0.
	binarizer = preprocessing.Binarizer(threshold=5)
	data_biner = binarizer.transform(input)

	output:
	[[1 0 0]
	[0 1 0]
	[1 0 1]]
	# Before mean removal: mean over all entries and per-column standard deviation.
	overall_mean = input.mean()
	column_std = input.std(axis=0)
	print("Mean = ", overall_mean, "\n", "Std deviation = ", column_std)

	output:
	Mean = 4.66666666667
	Std deviation = [ 2.1602469 2.3570226 2.05480467]

	# After mean removal: standardize the data, then report the per-column mean
	# and standard deviation of the result so the effect of the scaling is visible
	# (the mean should be ~0 and the standard deviation 1).
	data_scaled = preprocessing.scale(input)
	print("Mean = ", data_scaled.mean(axis=0), "\n",
	      "Std deviation = ", data_scaled.std(axis=0))
	# Min-max scaling: linearly rescale each column into the range [0, 1].
	data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
	data_scaler_minmax.fit(input)
	data_scaled_minmax = data_scaler_minmax.transform(input)
	# Bare expression: echoes the array when run in an interactive session.
	data_scaled_minmax

	output:
	array([[ 0.8, 0. , 0.4],
	[ 0. , 1. , 0. ],
	[ 1. , 0. , 1. ]])
	# Normalize every row to unit L1 norm (absolute values sum to 1) and to
	# unit L2 norm (Euclidean length 1).
	data_normalized_l1 = preprocessing.normalize(input, norm='l1')
	data_normalized_l2 = preprocessing.normalize(input, norm='l2')

	for heading, normalized in (
	    ("\nL1 normalized data:\n", data_normalized_l1),
	    ("\nL2 normalized data:\n", data_normalized_l2),
	):
	    print(heading, normalized)

	L1 normalized data:
	[[ 0.61538462 0.15384615 0.23076923]
	[ 0.33333333 0.58333333 0.08333333]
	[ 0.52941176 0.11764706 0.35294118]]
	# Day-of-week labels (in Indonesian) to be encoded as integers.
	label_kategori = ['senin', 'selasa', 'rabu', 'kamis', 'jumat', 'sabtu', 'minggu']

	# fit() returns the encoder itself, so construction and fitting are chained.
	encoder = preprocessing.LabelEncoder().fit(label_kategori)

	# Show the integer index assigned to each learned class.
	print("\nLabel mapping:")
	for index, label in enumerate(encoder.classes_):
	    print(label, '>', index)

	output:
	Label mapping:
	jumat > 0
	kamis > 1
	minggu > 2
	rabu > 3
	sabtu > 4
	selasa > 5
	senin > 6
	# Replace the placeholder with the name of your own bucket before running.
	# (Presumably an Amazon S3 bucket, given the boto3/sagemaker imports - confirm.)
	# The placeholder is a string literal so that the snippet is valid Python.
	bucket = '<your-bucket>'
	# Prefix string for this example (presumably the key prefix inside the bucket).
	prefix = 'topic-kmeans'

	import warnings
	warnings.simplefilter("ignore")

	import os
	import boto3
	import sagemaker
	import numpy as np
	# Category names to load from the training set (a subset of what is available).
	categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
	# To run the analysis on every category instead, use the following:
	# categories = None
	print("Extracting features from the training dataset using a sparse vectorizer")
	# Start of the timed section (assumes `time` is the function from the `time`
	# module, imported elsewhere - confirm).
	t0 = time()

	# TF-IDF configuration: document-frequency bounds (max_df, min_df), English
	# stop-word removal and a cap of 10000 features.
	vectorizer = TfidfVectorizer(
	    max_df=0.5,
	    max_features=10000,
	    min_df=2,
	    stop_words='english',
	    use_idf=True,
	)

	# Learn the vocabulary and build the feature matrix in a single pass.
	X = vectorizer.fit_transform(dataset.data)

	print("done in %fs" % (time() - t0))
	# Toggle between the mini-batch and the full-batch clustering estimator.
	minibatch = False
	# Number of clusters to fit.
	true_k = 4

	km = (
	    MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
	                    init_size=1000, batch_size=1000, verbose=False)
	    if minibatch
	    else KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
	                verbose=False)
	)