Skip to content

Instantly share code, notes, and snippets.

@KarlHerler
Created October 3, 2012 17:43
Show Gist options
  • Save KarlHerler/3828529 to your computer and use it in GitHub Desktop.
Save KarlHerler/3828529 to your computer and use it in GitHub Desktop.
function [X_norm, mu, sigma] = featureNormalize(X)
%FEATURENORMALIZE Normalizes the features in X
% FEATURENORMALIZE(X) returns a normalized version of X where
% the mean value of each feature is 0 and the standard deviation
% is 1. This is often a good preprocessing step to do when
% working with learning algorithms.
%
% Returns:
%   X_norm - X with each column mean-centered and scaled by its std
%   mu     - 1 x m row vector of column means
%   sigma  - 1 x m row vector of column standard deviations (as computed;
%            may contain zeros for constant features)
mu = mean(X);
X_norm = bsxfun(@minus, X, mu);
sigma = std(X_norm);
% Guard against constant features: dividing by sigma == 0 would fill the
% column with NaN. Dividing by 1 instead leaves the centered zeros intact.
denom = sigma;
denom(denom == 0) = 1;
X_norm = bsxfun(@rdivide, X_norm, denom);
% ============================================================
end
% Data analysis and knowledge discovery ex1
% Load the dataset: space-separated numeric matrix; columns are the four
% measurements plus a numeric species label in column 5.
dataset = dlmread("iris_matrixed.dat", " ");
% Feature names as a blank-padded char matrix. The original literal
% ['Sepal length';'Sepal width';...] errors out: vertical concatenation of
% char rows requires every row to have the same length. strvcat pads the
% shorter names with trailing spaces, so features(i,:) still works below.
% (Also fixes the 'Pedal width' typo -> 'Petal width'.)
features = strvcat('Sepal length', 'Sepal width', 'Petal length', 'Petal width', 'Species');
% Public: Sturges' formula for histogram bin count.
% (Function name kept as "strugers" for existing callers.)
% n - the cardinality of the sample
%
% returns the number of bins, k = ceil(log2(n)) + 1
function k = strugers(n)
% The original computed ceil(log2(n+1)), which is not Sturges' rule and
% undercounts for most n (e.g. n = 150: 8 instead of 9).
k = ceil(log2(n)) + 1;
end
% Public: Scott's normal reference rule for histogram bin width.
% Scott's rule is h = 3.5 * sigma * n^(-1/3) where sigma is the sample
% standard deviation. The original hard-coded sqrt(n) in place of sigma;
% that legacy behavior is kept when no data vector is supplied, so
% existing single-argument callers are unaffected.
%
% n    - the cardinality of the sample
% vect - (optional) the sample itself; when given, its std is used as sigma
%
% returns the (integer-rounded) bin width h
function h = scotts(n, vect)
if nargin < 2
  % Legacy stand-in for sigma; NOT Scott's rule proper.
  s = sqrt(n);
else
  s = std(vect);
end
h = ceil(3.5 * s * n^(-1/3));
end
% Public: Square-root choice rule for histogram bin count.
% n - the cardinality of the sample
%
% returns the number of bins, ceil(sqrt(n))
function k = square_choice(n)
k = ceil(n .^ 0.5);
end
% Public: Freedman-Diaconis rule for histogram bin width.
% n    - the cardinality of the sample
% vect - the sample itself (its interquartile range drives the width)
%
% returns the bin width h = 2 * IQR(vect) / n^(1/3)
function h = freedman_diaconis(n, vect)
h = (2 * iqr(vect)) / nthroot(n, 3);
end
% Public: L2 risk minimization (Shimazaki-Shinomoto style) bin count --
% PLACEHOLDER. NOTE(review): the body returns n unchanged; no risk
% minimization is performed, and the original header comment was
% copy-pasted from the Sturges function above.
% n - the cardinality of the sample
%
% returns n unchanged (NOT a computed bin count)
function r = l2_risk_min(n)
r = n;
end
% Public: convert a bin width into a bin count for the given sample.
% vect - the sample
% h    - the bin width
%
% returns ceil(range(vect) / h), the number of bins covering the data
function k = number_of_bins(vect, h)
span = max(vect) - min(vect);
k = ceil(span / h);
end
% Report dataset dimensions and the bin counts/widths suggested by each rule.
% NOTE: the missing semicolons on the last four lines are intentional --
% Octave echoes each result (e.g. "stru = 8") as the script's output.
n = length(dataset); % the size of the dataset
m_labels = length(dataset(1, :)); % the number of features in the dataset incl labels
m = m_labels - 1; % the number of features in the dataset
% NOTE(review): "with with" in the second message is a typo in a runtime
% string, left as-is here.
disp(sprintf('Size of the dataset: n = %d and m = %d', n, m));disp("");
disp('Bin sizes with with various choice strategies');disp("");
stru = strugers(n)
scot = scotts(n)
sqch = square_choice(n)
% Freedman-Diaconis yields a bin WIDTH, so it is converted to a count here,
% using column 3 (petal length) as the representative feature.
free = number_of_bins(dataset(:, 3), freedman_diaconis(n, dataset(:, 3)))
% Public: plot a histogram of one feature with a given number of bins.
% i        - column index of the feature to plot
% h        - number of bins to pass to hist
% method   - name of the bin-selection strategy (for the label)
% dataset  - the full data matrix
% features - char matrix of feature names, one per row
%
% Blocks on a keypress first so successive plots can be stepped through.
function histy(i, h, method, dataset, features)
pause;
label = sprintf('Feature: %s - method: %s', features(i,:), method);
disp(label);
hist(dataset(:, i), h)
title(label)
end
%for i = 1:m
% histy(i, stru, "Strugers", dataset, features)
% histy(i, scot, "Scotts", dataset, features)
% histy(i, sqch, "Square-root choice", dataset, features)
% histy(i, free, "Freedman–Diaconis choice", dataset, features)
% disp("")
% pause;
%end
% Keep only the four numeric measurement columns, dropping the species label.
labelless = dataset(:, 1:4);
%% Task 2 - boxplots
%boxplot ({dataset(:, 1), dataset(:, 2), dataset(:, 3), dataset(:, 4)}, 1)
%set(gca,'XTickLabel', {'', 'Sepal length', 'Sepal width', 'Petal length', 'Pedal width'})
%pause;
%% Task 3 - scatterplot
%plotmatrix(labelless)
%pause;
%% Task 4 PCA
% Standardize the features first so each contributes on a comparable scale.
[normalized_dataset, mu, sigma] = featureNormalize(labelless); % We must normalize the dataset
[m, n] = size(normalized_dataset); % grab the size
% Covariance matrix of the standardized data. NOTE: this reuses (shadows)
% the name `sigma` from featureNormalize above; the std vector is no longer
% needed at this point. Missing semicolon is intentional: the matrix is
% echoed as script output.
sigma = (normalized_dataset'*normalized_dataset)/m %grab the covariance matrix of the dataset
% perform Singular value decomposition on the normalized dataset in order to extract
% eigenvectors of the covariance matrix.
[U, S, V] = svd(sigma);
% BUG FIX: project the NORMALIZED data -- the data whose covariance produced
% U -- onto the top two principal components. The original projected the raw
% `labelless` matrix, mixing unscaled features with eigenvectors of the
% standardized covariance.
Z = normalized_dataset * U(:, 1:2); % Project the dataset over two features
scatter(Z(:,1), Z(:, 2))
%plot(Z)
%% Task 5 - 2D MDS
%% tehee I haven't done this
%% Task 6 - Paralell coords
%plot(normalized_dataset')
% do some more magic with the labling
%% Task 7 - Spearmans and Kendalls
% NOTE(review): spearman/kendall are not base Octave -- presumably from the
% octave-forge statistics package; confirm it is installed.
disp("Spearman's rho");
spearman(labelless)
disp("Kendalls's tau");
kendall(labelless)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment