Skip to content

Instantly share code, notes, and snippets.

@KarlHerler
Created October 3, 2012 17:43
Show Gist options
  • Save KarlHerler/3828529 to your computer and use it in GitHub Desktop.
Save KarlHerler/3828529 to your computer and use it in GitHub Desktop.
function [X_norm, mu, sigma] = featureNormalize(X)
%FEATURENORMALIZE Normalizes the features in X
% FEATURENORMALIZE(X) returns a normalized version of X where
% the mean value of each feature is 0 and the standard deviation
% is 1. This is often a good preprocessing step to do when
% working with learning algorithms.
%
% Returns:
%   X_norm - X with each column mean-centered and scaled by its std
%   mu     - 1 x m row vector of column means
%   sigma  - 1 x m row vector of column standard deviations (as computed;
%            may contain zeros for constant features)
mu = mean(X);
X_norm = bsxfun(@minus, X, mu);
sigma = std(X_norm);
% Guard against constant features: dividing by sigma == 0 would fill the
% column with NaN. Dividing by 1 instead leaves the centered zeros intact.
denom = sigma;
denom(denom == 0) = 1;
X_norm = bsxfun(@rdivide, X_norm, denom);
% ============================================================
end
% Data analysis and knowledge discovery ex1
% Load the dataset: space-separated numeric matrix; columns are the four
% measurements plus a numeric species label in column 5.
dataset = dlmread("iris_matrixed.dat", " ");
% Feature names as a blank-padded char matrix. The original literal
% ['Sepal length';'Sepal width';...] errors out: vertical concatenation of
% char rows requires every row to have the same length. strvcat pads the
% shorter names with trailing spaces, so features(i,:) still works below.
% (Also fixes the 'Pedal width' typo -> 'Petal width'.)
features = strvcat('Sepal length', 'Sepal width', 'Petal length', 'Petal width', 'Species');
% Public: Sturges' formula for histogram bin count.
% (Function name kept as "strugers" for existing callers.)
% n - the cardinality of the sample
%
% returns the number of bins, k = ceil(log2(n)) + 1
function k = strugers(n)
% The original computed ceil(log2(n+1)), which is not Sturges' rule and
% undercounts for most n (e.g. n = 150: 8 instead of 9).
k = ceil(log2(n)) + 1;
end
% Public: Scott's normal reference rule for histogram bin width.
% Scott's rule is h = 3.5 * sigma * n^(-1/3) where sigma is the sample
% standard deviation. The original hard-coded sqrt(n) in place of sigma;
% that legacy behavior is kept when no data vector is supplied, so
% existing single-argument callers are unaffected.
%
% n    - the cardinality of the sample
% vect - (optional) the sample itself; when given, its std is used as sigma
%
% returns the (integer-rounded) bin width h
function h = scotts(n, vect)
if nargin < 2
  % Legacy stand-in for sigma; NOT Scott's rule proper.
  s = sqrt(n);
else
  s = std(vect);
end
h = ceil(3.5 * s * n^(-1/3));
end
% Public: Square-root choice rule for histogram bin count.
% n - the cardinality of the sample
%
% returns the number of bins, ceil(sqrt(n))
function k = square_choice(n)
k = ceil(n .^ 0.5);
end
% Public: Freedman-Diaconis rule for histogram bin width.
% n    - the cardinality of the sample
% vect - the sample itself (its interquartile range drives the width)
%
% returns the bin width h = 2 * IQR(vect) / n^(1/3)
function h = freedman_diaconis(n, vect)
h = (2 * iqr(vect)) / nthroot(n, 3);
end
% Public: L2 risk minimization (Shimazaki-Shinomoto style) bin count --
% PLACEHOLDER. NOTE(review): the body returns n unchanged; no risk
% minimization is performed, and the original header comment was
% copy-pasted from the Sturges function above.
% n - the cardinality of the sample
%
% returns n unchanged (NOT a computed bin count)
function r = l2_risk_min(n)
r = n;
end
% Public: convert a bin width into a bin count for the given sample.
% vect - the sample
% h    - the bin width
%
% returns ceil(range(vect) / h), the number of bins covering the data
function k = number_of_bins(vect, h)
span = max(vect) - min(vect);
k = ceil(span / h);
end
% Report dataset dimensions and the bin counts/widths suggested by each rule.
% NOTE: the missing semicolons on the last four lines are intentional --
% Octave echoes each result (e.g. "stru = 8") as the script's output.
n = length(dataset); % the size of the dataset
m_labels = length(dataset(1, :)); % the number of features in the dataset incl labels
m = m_labels - 1; % the number of features in the dataset
% NOTE(review): "with with" in the second message is a typo in a runtime
% string, left as-is here.
disp(sprintf('Size of the dataset: n = %d and m = %d', n, m));disp("");
disp('Bin sizes with with various choice strategies');disp("");
stru = strugers(n)
scot = scotts(n)
sqch = square_choice(n)
% Freedman-Diaconis yields a bin WIDTH, so it is converted to a count here,
% using column 3 (petal length) as the representative feature.
free = number_of_bins(dataset(:, 3), freedman_diaconis(n, dataset(:, 3)))
% Public: plot a histogram of one feature with a given number of bins.
% i        - column index of the feature to plot
% h        - number of bins to pass to hist
% method   - name of the bin-selection strategy (for the label)
% dataset  - the full data matrix
% features - char matrix of feature names, one per row
%
% Blocks on a keypress first so successive plots can be stepped through.
function histy(i, h, method, dataset, features)
pause;
label = sprintf('Feature: %s - method: %s', features(i,:), method);
disp(label);
hist(dataset(:, i), h)
title(label)
end
%for i = 1:m
% histy(i, stru, "Strugers", dataset, features)
% histy(i, scot, "Scotts", dataset, features)
% histy(i, sqch, "Square-root choice", dataset, features)
% histy(i, free, "Freedman–Diaconis choice", dataset, features)
% disp("")
% pause;
%end
% Keep only the four numeric measurement columns, dropping the species label.
labelless = dataset(:, 1:4);
%% Task 2 - boxplots
%boxplot ({dataset(:, 1), dataset(:, 2), dataset(:, 3), dataset(:, 4)}, 1)
%set(gca,'XTickLabel', {'', 'Sepal length', 'Sepal width', 'Petal length', 'Pedal width'})
%pause;
%% Task 3 - scatterplot
%plotmatrix(labelless)
%pause;
%% Task 4 PCA
% Standardize the features first so each contributes on a comparable scale.
[normalized_dataset, mu, sigma] = featureNormalize(labelless); % We must normalize the dataset
[m, n] = size(normalized_dataset); % grab the size
% Covariance matrix of the standardized data. NOTE: this reuses (shadows)
% the name `sigma` from featureNormalize above; the std vector is no longer
% needed at this point. Missing semicolon is intentional: the matrix is
% echoed as script output.
sigma = (normalized_dataset'*normalized_dataset)/m %grab the covariance matrix of the dataset
% perform Singular value decomposition on the normalized dataset in order to extract
% eigenvectors of the covariance matrix.
[U, S, V] = svd(sigma);
% BUG FIX: project the NORMALIZED data -- the data whose covariance produced
% U -- onto the top two principal components. The original projected the raw
% `labelless` matrix, mixing unscaled features with eigenvectors of the
% standardized covariance.
Z = normalized_dataset * U(:, 1:2); % Project the dataset over two features
scatter(Z(:,1), Z(:, 2))
%plot(Z)
%% Task 5 - 2D MDS
%% tehee I haven't done this
%% Task 6 - Paralell coords
%plot(normalized_dataset')
% do some more magic with the labling
%% Task 7 - Spearmans and Kendalls
% NOTE(review): spearman/kendall are not base Octave -- presumably from the
% octave-forge statistics package; confirm it is installed.
disp("Spearman's rho");
spearman(labelless)
disp("Kendalls's tau");
kendall(labelless)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment