XinyueZ/ml_kNearestNeighbor.m

## ml_kNearestNeighbor.m
disp("k-nearest neighbors algorithm (Simple and plain implementation)");

function [prediction, selected_by_K, distances] = kNN(X, model, K, MODE)
    % X is a feature vector to query data in model with nearest distance.
    % model contains the "trained" data, actually for KNN we don't train and there aren't parameters,
    %       we compare distance with the ground-truth data.
    %       Last column of model is label column.
    % K is the first K result to select.
    % MODE for "regression" , "classification" or "sortOnly".

    m = size(model, 1);
    n_X = size(X, 2);

    X_truth = model(:, end - 1);
    Y_truth = model(:, end);

    X = repmat(X, m, 1);
    % Find distances between X and each ground-truth in model.
    distances = sum((X - X_truth).^2, 2);
    % Sort distances.
    [distances, I] = sort(distances, "ascend");
    % First K shorest rows of model will be selected, here is indices in model.
    selected_indices_by_K = I(1:K);
    % Get associated labels.
    selected_labels_by_K = Y_truth(selected_indices_by_K);

    % Prediction
    %
    % For regression we use average on K labels.
    if (strcmp(MODE, "regression") == 1)
        prediction = mean(selected_labels_by_K); %
    end

    % For classification we use the most frequent label in the top K lables.
    if (strcmp(MODE, "classification") == 1)
        prediction = mode(selected_labels_by_K); %
    end

    if (strcmp(MODE, "sortOnly") == 1)
        prediction = [];
    end

    % Optional: addition information of selected rows from model.
    selected_by_K = [selected_indices_by_K model(selected_indices_by_K, :)];
end

% MxN dim model, in each row the first is feature and later is regression label.
regression_model = [
                65.75, 112.99;
                71.52, 136.49;
                69.40, 153.03;
                68.22, 142.34;
                67.79, 144.30;
                68.70, 123.30;
                69.80, 141.49;
                70.01, 136.46;
                67.90, 112.37;
                66.49, 127.45;
                ]

% regression_pred = 128.25
% selected_by_K =
%      1.0000    65.7500   112.9900
%     10.0000    66.4900   127.4500
%      5.0000    67.7900   144.3000
% distances =
%     33.062
%     42.120
%     60.684
%     62.410
%     67.568
%     75.690
%     88.360
%     96.040
%    100.200
%    132.710

X = [60];
[regression_pred, selected_by_K, distances] = kNN(X, regression_model, 3, "regression");
regression_pred
selected_by_K
distances

% MxN dim model, in each row the first is feature and later is classification label.
classification_model = [
                    22, 1;
                    23, 1;
                    21, 1;
                    18, 1;
                    19, 1;
                    25, 0;
                    27, 0;
                    29, 0;
                    31, 0;
                    45, 0;
                    ]

% classification_pred = 0
% selected_by_K =
%     9   31    0
%     8   29    0
%     7   27    0

% distances =
%      4
%     16
%     36
%     64
%    100
%    121
%    144
%    144
%    196
%    225

X = [33];
[classification_pred, selected_by_K, distances] = kNN(X, classification_model, 3, "classification");
classification_pred
selected_by_K
distances

% Simple recommendation algorithm.
% Suppose we have list of movies. Each row is prepresented by style and taste.
% First two columns are movie ID and name.
% Last column is movie label, we do not use it for recommendation.
% The recommendation needs the sort as result only, there isn't regression or classification(although we can extend this topic in production).

filename = "movies_data.csv"; % Download https://dl.dropbox.com/s/93wnz4jvmqsjruu/movies_data.csv
model = csvread(filename);
model = model(2:end, :);
model = model(:, 3:end);

% A movie I am search on some website I build.
% In search UI I input "Queen of Katwe".
% The feature vector of this file shall be [7.4,1,1,0,0,0,0,0], exc. label and ID & name.
X = [7.4, 1, 1, 0, 0, 0, 0, 0];

% Do kNN without regression and classification.
[~, selected_by_K, distances] = kNN(X, model, 3, "sortOnly");
selected_by_K


function data = load_data(filename, NUMBER_OF_LINES)
    fid = fopen(filename, "r");

    data = cell (NUMBER_OF_LINES, 2);
    counter = 1;

    line = fgetl(fid);

    while line != -1
        separator_index = index(line, '|');
        data {counter, 1} = substr(line, 1, separator_index - 1); % Up to the separator
        data {counter, 2} = substr(line, separator_index + 1, length(line) - separator_index); % After the separator
        counter++;

        line = fgetl(fid);
    endwhile

    fclose(fid);
end

% Get the first 3 movies which are similar to "Queen of Katwe".
data = load_data(filename, size(model, 1));
data = data(2:end);
selected_rows = data(selected_by_K(:,1));
selected_rows
	disp("k-nearest neighbors algorithm (Simple and plain implementation)");

	function [prediction, selected_by_K, distances] = kNN(X, model, K, MODE)
	% X is a feature vector to query data in model with nearest distance.
	% model contains the "trained" data, actually for KNN we don't train and there aren't parameters,
	% we compare distance with the ground-truth data.
	% Last column of model is label column.
	% K is the first K result to select.
	% MODE for "regression" , "classification" or "sortOnly".

	m = size(model, 1);
	n_X = size(X, 2);

	X_truth = model(:, end - 1);
	Y_truth = model(:, end);

	X = repmat(X, m, 1);
	% Find distances between X and each ground-truth in model.
	distances = sum((X - X_truth).^2, 2);
	% Sort distances.
	[distances, I] = sort(distances, "ascend");
	% First K shorest rows of model will be selected, here is indices in model.
	selected_indices_by_K = I(1:K);
	% Get associated labels.
	selected_labels_by_K = Y_truth(selected_indices_by_K);

	% Prediction
	%
	% For regression we use average on K labels.
	if (strcmp(MODE, "regression") == 1)
	prediction = mean(selected_labels_by_K); %
	end

	% For classification we use the most frequent label in the top K lables.
	if (strcmp(MODE, "classification") == 1)
	prediction = mode(selected_labels_by_K); %
	end

	if (strcmp(MODE, "sortOnly") == 1)
	prediction = [];
	end

	% Optional: addition information of selected rows from model.
	selected_by_K = [selected_indices_by_K model(selected_indices_by_K, :)];
	end

	% MxN dim model, in each row the first is feature and later is regression label.
	regression_model = [
	65.75, 112.99;
	71.52, 136.49;
	69.40, 153.03;
	68.22, 142.34;
	67.79, 144.30;
	68.70, 123.30;
	69.80, 141.49;
	70.01, 136.46;
	67.90, 112.37;
	66.49, 127.45;
	]

	% regression_pred = 128.25
	% selected_by_K =
	% 1.0000 65.7500 112.9900
	% 10.0000 66.4900 127.4500
	% 5.0000 67.7900 144.3000
	% distances =
	% 33.062
	% 42.120
	% 60.684
	% 62.410
	% 67.568
	% 75.690
	% 88.360
	% 96.040
	% 100.200
	% 132.710

	X = [60];
	[regression_pred, selected_by_K, distances] = kNN(X, regression_model, 3, "regression");
	regression_pred
	selected_by_K
	distances

	% MxN dim model, in each row the first is feature and later is classification label.
	classification_model = [
	22, 1;
	23, 1;
	21, 1;
	18, 1;
	19, 1;
	25, 0;
	27, 0;
	29, 0;
	31, 0;
	45, 0;
	]

	% classification_pred = 0
	% selected_by_K =
	% 9 31 0
	% 8 29 0
	% 7 27 0

	% distances =
	% 4
	% 16
	% 36
	% 64
	% 100
	% 121
	% 144
	% 144
	% 196
	% 225

	X = [33];
	[classification_pred, selected_by_K, distances] = kNN(X, classification_model, 3, "classification");
	classification_pred
	selected_by_K
	distances

	% Simple recommendation algorithm.
	% Suppose we have list of movies. Each row is prepresented by style and taste.
	% First two columns are movie ID and name.
	% Last column is movie label, we do not use it for recommendation.
	% The recommendation needs the sort as result only, there isn't regression or classification(although we can extend this topic in production).

	filename = "movies_data.csv"; % Download https://dl.dropbox.com/s/93wnz4jvmqsjruu/movies_data.csv
	model = csvread(filename);
	model = model(2:end, :);
	model = model(:, 3:end);

	% A movie I am search on some website I build.
	% In search UI I input "Queen of Katwe".
	% The feature vector of this file shall be [7.4,1,1,0,0,0,0,0], exc. label and ID & name.
	X = [7.4, 1, 1, 0, 0, 0, 0, 0];

	% Do kNN without regression and classification.
	[~, selected_by_K, distances] = kNN(X, model, 3, "sortOnly");
	selected_by_K


	function data = load_data(filename, NUMBER_OF_LINES)
	fid = fopen(filename, "r");

	data = cell (NUMBER_OF_LINES, 2);
	counter = 1;

	line = fgetl(fid);

	while line != -1
	separator_index = index(line, '\|');
	data {counter, 1} = substr(line, 1, separator_index - 1); % Up to the separator
	data {counter, 2} = substr(line, separator_index + 1, length(line) - separator_index); % After the separator
	counter++;

	line = fgetl(fid);
	endwhile

	fclose(fid);
	end

	% Get the first 3 movies which are similar to "Queen of Katwe".
	data = load_data(filename, size(model, 1));
	data = data(2:end);
	selected_rows = data(selected_by_K(:,1));
	selected_rows