Last active
December 13, 2021 16:35
-
-
Save XinyueZ/91aee5bf6f7aeae5c5ead76f5ff5cff7 to your computer and use it in GitHub Desktop.
k-nearest neighbors algorithm (Simple and plain implementation)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
disp("k-nearest neighbors algorithm (Simple and plain implementation)"); | |
function [prediction, selected_by_K, distances] = kNN(X, model, K, MODE) | |
% X is a feature vector to query data in model with nearest distance. | |
% model contains the "trained" data, actually for KNN we don't train and there aren't parameters, | |
% we compare distance with the ground-truth data. | |
% Last column of model is label column. | |
% K is the first K result to select. | |
% MODE for "regression" , "classification" or "sortOnly". | |
m = size(model, 1); | |
n_X = size(X, 2); | |
X_truth = model(:, end - 1); | |
Y_truth = model(:, end); | |
X = repmat(X, m, 1); | |
% Find distances between X and each ground-truth in model. | |
distances = sum((X - X_truth).^2, 2); | |
% Sort distances. | |
[distances, I] = sort(distances, "ascend"); | |
% First K shorest rows of model will be selected, here is indices in model. | |
selected_indices_by_K = I(1:K); | |
% Get associated labels. | |
selected_labels_by_K = Y_truth(selected_indices_by_K); | |
% Prediction | |
% | |
% For regression we use average on K labels. | |
if (strcmp(MODE, "regression") == 1) | |
prediction = mean(selected_labels_by_K); % | |
end | |
% For classification we use the most frequent label in the top K lables. | |
if (strcmp(MODE, "classification") == 1) | |
prediction = mode(selected_labels_by_K); % | |
end | |
if (strcmp(MODE, "sortOnly") == 1) | |
prediction = []; | |
end | |
% Optional: addition information of selected rows from model. | |
selected_by_K = [selected_indices_by_K model(selected_indices_by_K, :)]; | |
end | |
% MxN dim model, in each row the first is feature and later is regression label. | |
regression_model = [ | |
65.75, 112.99; | |
71.52, 136.49; | |
69.40, 153.03; | |
68.22, 142.34; | |
67.79, 144.30; | |
68.70, 123.30; | |
69.80, 141.49; | |
70.01, 136.46; | |
67.90, 112.37; | |
66.49, 127.45; | |
] | |
% regression_pred = 128.25 | |
% selected_by_K = | |
% 1.0000 65.7500 112.9900 | |
% 10.0000 66.4900 127.4500 | |
% 5.0000 67.7900 144.3000 | |
% distances = | |
% 33.062 | |
% 42.120 | |
% 60.684 | |
% 62.410 | |
% 67.568 | |
% 75.690 | |
% 88.360 | |
% 96.040 | |
% 100.200 | |
% 132.710 | |
X = [60]; | |
[regression_pred, selected_by_K, distances] = kNN(X, regression_model, 3, "regression"); | |
regression_pred | |
selected_by_K | |
distances | |
% MxN dim model, in each row the first is feature and later is classification label. | |
classification_model = [ | |
22, 1; | |
23, 1; | |
21, 1; | |
18, 1; | |
19, 1; | |
25, 0; | |
27, 0; | |
29, 0; | |
31, 0; | |
45, 0; | |
] | |
% classification_pred = 0 | |
% selected_by_K = | |
% 9 31 0 | |
% 8 29 0 | |
% 7 27 0 | |
% distances = | |
% 4 | |
% 16 | |
% 36 | |
% 64 | |
% 100 | |
% 121 | |
% 144 | |
% 144 | |
% 196 | |
% 225 | |
X = [33]; | |
[classification_pred, selected_by_K, distances] = kNN(X, classification_model, 3, "classification"); | |
classification_pred | |
selected_by_K | |
distances | |
% Simple recommendation algorithm. | |
% Suppose we have list of movies. Each row is prepresented by style and taste. | |
% First two columns are movie ID and name. | |
% Last column is movie label, we do not use it for recommendation. | |
% The recommendation needs the sort as result only, there isn't regression or classification(although we can extend this topic in production). | |
filename = "movies_data.csv"; % Download https://dl.dropbox.com/s/93wnz4jvmqsjruu/movies_data.csv | |
model = csvread(filename); | |
model = model(2:end, :); | |
model = model(:, 3:end); | |
% A movie I am search on some website I build. | |
% In search UI I input "Queen of Katwe". | |
% The feature vector of this file shall be [7.4,1,1,0,0,0,0,0], exc. label and ID & name. | |
X = [7.4, 1, 1, 0, 0, 0, 0, 0]; | |
% Do kNN without regression and classification. | |
[~, selected_by_K, distances] = kNN(X, model, 3, "sortOnly"); | |
selected_by_K | |
function data = load_data(filename, NUMBER_OF_LINES) | |
fid = fopen(filename, "r"); | |
data = cell (NUMBER_OF_LINES, 2); | |
counter = 1; | |
line = fgetl(fid); | |
while line != -1 | |
separator_index = index(line, '|'); | |
data {counter, 1} = substr(line, 1, separator_index - 1); % Up to the separator | |
data {counter, 2} = substr(line, separator_index + 1, length(line) - separator_index); % After the separator | |
counter++; | |
line = fgetl(fid); | |
endwhile | |
fclose(fid); | |
end | |
% Get the first 3 movies which are similar to "Queen of Katwe". | |
data = load_data(filename, size(model, 1)); | |
data = data(2:end); | |
selected_rows = data(selected_by_K(:,1)); | |
selected_rows |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment