Skip to content

Instantly share code, notes, and snippets.

View andreaschandra's full-sized avatar
💼
work at anywhere

Andreas Chandra andreaschandra

💼
work at anywhere
View GitHub Profile
import numpy as np
from sklearn import preprocessing
# Small 3x3 integer matrix used as the demo data.
# NOTE: the name `input` shadows the Python builtin; it is kept because the
# snippets further down refer to the array by this name.
input = np.array([
    [8, 2, 3],
    [4, 7, 1],
    [9, 2, 6],
])

# Binarize with a cut-off of 5: entries above the threshold become 1, all
# other entries become 0.
data_biner = preprocessing.Binarizer(threshold=5).transform(input)
output:
[[1 0 0]
[0 1 0]
[1 0 1]]
# Before mean removal: mean over the whole matrix and standard deviation of
# each column (axis=0).
print(
    "Mean = ", input.mean(),
    "\n",
    "Std deviation = ", input.std(axis=0),
)
output:
Mean = 4.66666666667
Std deviation = [ 2.1602469 2.3570226 2.05480467]
# After mean removal: preprocessing.scale standardizes the data column by
# column, so each column should end up with mean ~0 and standard deviation ~1.
data_scaled = preprocessing.scale(input)

# Report the statistics of the scaled data. The label says "Mean", so print
# the per-column mean (and the matching standard deviation) instead of
# dumping the whole scaled array under that label.
print(
    "Mean = ", data_scaled.mean(axis=0),
    "\n",
    "Std deviation = ", data_scaled.std(axis=0),
)
# Min-max scaling: linearly map every column onto the closed range [0, 1].
data_scaler_minmax = preprocessing.MinMaxScaler(
    feature_range=(0, 1),
)

# Learn the per-column minimum/maximum and rescale the data in one step.
data_scaled_minmax = data_scaler_minmax.fit_transform(
    input,
)

# Bare expression: displays the result when run as a notebook cell / in a REPL.
data_scaled_minmax
output:
array([[ 0.8, 0. , 0.4],
[ 0. , 1. , 0. ],
[ 1. , 0. , 1. ]])
# L1 normalization: scale each row so its absolute values sum to 1.
data_normalized_l1 = preprocessing.normalize(input, norm='l1')
print("\nL1 normalized data:\n", data_normalized_l1)

# L2 normalization: scale each row to unit Euclidean length.
data_normalized_l2 = preprocessing.normalize(input, norm='l2')
print("\nL2 normalized data:\n", data_normalized_l2)
L1 normalized data:
[[ 0.61538462 0.15384615 0.23076923]
[ 0.33333333 0.58333333 0.08333333]
[ 0.52941176 0.11764706 0.35294118]]
# Category labels to encode: Indonesian day names, Monday through Sunday.
label_kategori = ['senin', 'selasa', 'rabu', 'kamis', 'jumat', 'sabtu', 'minggu']

# Fit the encoder so that every distinct label is assigned an integer code.
encoder = preprocessing.LabelEncoder()
encoder.fit(label_kategori)

# Print the learned mapping: a label's position in encoder.classes_ is the
# integer it is encoded as.
print("\nLabel mapping:")
for i, item in enumerate(encoder.classes_):
    # The loop body must be indented; without it the snippet does not parse.
    print(item, '>', i)
output:
Label mapping:
# Placeholder to be replaced with your own bucket name before running.
# Kept as a string literal so the file parses; a bare <your-bucket> token is
# a syntax error. Presumably an S3 bucket, given the boto3/sagemaker imports
# that follow -- confirm against the code that consumes it.
bucket = '<your-bucket>'

# Prefix string for this example; presumably the key prefix used inside the
# bucket -- confirm against the code that consumes it.
prefix = 'topic-kmeans'
import warnings
# Ignore all warnings from this point on. This call is placed before the
# imports that follow, so warnings emitted while those modules load are
# suppressed as well; keep it ahead of them.
warnings.simplefilter("ignore")
import os
import boto3
import sagemaker
import numpy as np
# Subset of categories to load from the training set.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
# To run the analysis on all the categories instead, enable the line below.
# categories = None
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()

# TF-IDF settings: ignore terms that occur in more than half of the documents
# or in fewer than two documents, drop English stop words, and cap the
# vocabulary at 10000 terms.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    max_features=10000,
    min_df=2,
    stop_words='english',
    use_idf=True,
)

# Learn the vocabulary and build the document-term matrix in one pass.
X = vectorizer.fit_transform(dataset.data)

# Elapsed wall-clock time since t0 was taken.
print("done in %fs" % (time() - t0))
# Switch between the mini-batch estimator (True) and plain KMeans (False).
minibatch = False

# Number of clusters to fit; presumably one per selected category -- confirm.
true_k = 4

# Build the clustering estimator. The branch bodies must be indented under
# if/else; without that the snippet does not parse.
if minibatch:
    km = MiniBatchKMeans(
        n_clusters=true_k, init='k-means++', n_init=1,
        init_size=1000, batch_size=1000, verbose=False,
    )
else:
    km = KMeans(
        n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
        verbose=False,
    )