Created
October 5, 2021 15:21
-
-
Save rohan-paul/a953fafd3011761e48dfaf6a17ec8d74 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.datasets import make_classification | |
from sklearn.model_selection import train_test_split | |
import numpy as np | |
from tqdm import tqdm | |
import random | |
from sklearn.metrics import accuracy_score | |
from sklearn.neighbors import KNeighborsClassifier | |
from matplotlib.colors import ListedColormap | |
import matplotlib.pyplot as plt | |
import warnings | |
warnings.filterwarnings("ignore") | |
x, y = make_classification(n_samples=10000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=60) | |
X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42) | |
def randomized_search_cv_custom(x_train_total, y_train_total, classifier, param_range, num_of_total_fold): | |
# x_train_total: its numpy array of shape, (n,d) | |
# y_train_total: its numpy array of shape, (n,) or (n,1) | |
# classifier: its typically KNeighborsClassifier() | |
# param_range: Integer representing how many hyper-parameters I am considering for each iteration | |
# num_of_total_fold: an integer, represents number of num_of_total_fold we need to devide the data and test our model | |
# generating hyper-parameter range | |
# generate 10 unique values(uniform random distribution) in the given range - starting 1 to "param_range" | |
# ex: if param_range = 50, we need to generate 10 random numbers in range 1 to 50 | |
ten_random_values_for_param_range = sorted(random.sample(range(1, param_range), 10)) | |
train_scores = [] | |
test_scores = [] | |
classifier_params = { 'n_neighbors': ten_random_values_for_param_range } | |
# it will take classifier and set of values for hyper parameters in dict type | |
# dict({hyper parmeter: [list of values]}) | |
# as we are implementing this only for KNN, the hyper parameter should be n_neighbors | |
# And I will return it from the function at the end so that I can use this same hyper-param | |
# while plotting the graph as well | |
for k in tqdm(classifier_params['n_neighbors']): | |
trainscores_folds = [] | |
testscores_folds = [] | |
for fold in range(0, num_of_total_fold): | |
# divide numbers ranging from 0 to len(x_train_fold) into groups = num_of_total_fold | |
# basically, splitting the data into k groups (k = len(x_train_fold) / num_of_total_fold) | |
# It works by first training the algorithm on the k_1 group of the data and | |
# evaluating it on the kth hold-out group as the test set. This is repeated | |
# so that each of the k groups is given an opportunity to be held out and used as the test set. | |
# ex: num_of_total_fold=3, and len(x_train_total)=100, we can divide numbers from 0 to 100 into 3 groups | |
# group 1: 0-33, group 2:34-66, group 3: 67-100 | |
num_of_elements_in_each_fold = int(len(x_train_total) / num_of_total_fold) | |
# for each hyperparameter that we generated in step 1: | |
# and using the above groups we have created in step 2 you will do cross-validation as follows | |
# first we will keep group 1+group 2 i.e. 0-66 as train data and | |
# group 3: 67-100 as test data, and find train and test accuracies | |
# second we will keep group 1+group 3 i.e. 0-33, 67-100 as train data and | |
# group 2: 34-66 as test data, and find train and test accuracies | |
# third we will keep group 2+group 3 i.e. 34-100 as train data and | |
# group 1: 0-33 as test data, and find train and test accuracies | |
# based on the 'num_of_total_fold' value we will do the same procedure | |
# NOW IMPLEMENTATION OF THE CONCEPT OF 'fold' as below | |
# For each of this inner loop running for values of fold (where fold represents num_of_total_fold of 0, 1, 2, 3 ...) | |
# each of the test_indices will have the data of a single fold ( which is = num_of_elements_in_each_fold ) | |
# i.e. the test_indices will be the range starting at | |
# num_of_elements_in_each_fold * fold and ending at num_of_elements_in_each_fold * (fold + 1) | |
# And this whole range needs to be converted to list => then apply set() to the list | |
# => and then again converted to list | |
test_indices = list(set(list(range((num_of_elements_in_each_fold * fold), (num_of_elements_in_each_fold*(fold+1)))))) | |
# print('test_indices ', test_indices) | |
# And the rest of the indices of the dataset will be the train_indices | |
train_indices = list(set(list(range(0, len(x_train_total)))) - set(test_indices) ) | |
# print('train_indices ', train_indices) | |
''' So for a dataset of total 100 rows, one loop of fold, will have the following test_indices and train_indices | |
test_indices [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] | |
train_indices [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74] | |
And the next loop of j will have as below (for the same dataset of total 100 rows ) | |
test_indices [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] | |
train_indices [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74] | |
''' | |
# after we have above, now select datapoints based on test_indices and train_indices | |
x_train_fold = x_train_total[train_indices] | |
y_train_fold = y_train_total[train_indices] | |
x_test_fold = x_train_total[test_indices] | |
y_test_fold = y_train_total[test_indices] | |
# Now based on our classifier assign corresponding parameter values | |
# and also fit() and predict() | |
classifier.n_neighbors = k | |
classifier.fit(x_train_fold, y_train_fold) | |
# First predict based on x_test_fold and keep the accuracy score in the testscores_folds | |
y_predicted = classifier.predict(x_test_fold) | |
testscores_folds.append(accuracy_score(y_test_fold, y_predicted)) | |
# Now run prediction based on x_train_fold and append the accuracy score in the trainscores_folds | |
y_predicted = classifier.predict(x_train_fold) | |
trainscores_folds.append(accuracy_score(y_train_fold, y_predicted)) | |
train_scores.append(np.mean(np.array(trainscores_folds))) | |
test_scores.append(np.mean(np.array(testscores_folds))) | |
return train_scores, test_scores, classifier_params | |
# Our Classifier is KNN, hence assign a variable to it. | |
neigh = KNeighborsClassifier() | |
params_range = 50 | |
number_of_total_folds = 3 | |
# Now invoking our custom function randomized_search_cv_custom(x_train,y_train,classifier, param_range, num_of_total_fold) and store the returned values | |
testscores, trainscores, params = randomized_search_cv_custom(X_train, y_train, neigh, params_range, number_of_total_folds) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment