Skip to content

Instantly share code, notes, and snippets.

@rohan-paul
Created October 5, 2021 15:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rohan-paul/a953fafd3011761e48dfaf6a17ec8d74 to your computer and use it in GitHub Desktop.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import random
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Build a synthetic 2-feature binary-classification dataset (10,000 samples,
# both features informative, one cluster per class) with a fixed seed.
x, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=60,
)
# Stratified train/test split so both subsets keep the original class balance.
X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)
def randomized_search_cv_custom(x_train_total, y_train_total, classifier, param_range, num_of_total_fold):
    """Randomized-search cross-validation, implemented by hand for KNN.

    Samples 10 unique candidate values of ``n_neighbors`` uniformly from
    ``1..param_range``, then for each candidate runs k-fold cross-validation
    (contiguous folds) on the supplied training data and records the mean
    train/test accuracy across folds.

    Parameters
    ----------
    x_train_total : numpy array of shape (n, d)
        Training features.
    y_train_total : numpy array of shape (n,) or (n, 1)
        Training labels.
    classifier : estimator
        Object exposing an ``n_neighbors`` attribute plus ``fit``/``predict``
        (typically ``KNeighborsClassifier()``). Mutated in place.
    param_range : int
        Upper bound (inclusive) of the hyper-parameter sampling range.
    num_of_total_fold : int
        Number of folds to divide the data into.

    Returns
    -------
    (train_scores, test_scores, classifier_params)
        Per-candidate mean train accuracies, mean test accuracies, and the
        ``{'n_neighbors': [...]}`` dict of sampled values (returned so the
        same candidates can be reused when plotting).
    """
    # Sample 10 unique candidates uniformly from 1..param_range (inclusive).
    # BUGFIX: the original used range(1, param_range), which silently
    # excluded param_range itself despite the stated intent
    # ("if param_range = 50, generate 10 random numbers in range 1 to 50").
    candidate_ks = sorted(random.sample(range(1, param_range + 1), 10))
    classifier_params = {'n_neighbors': candidate_ks}

    train_scores = []
    test_scores = []

    # Size of each contiguous fold. Hoisted out of the loops — it does not
    # depend on k or fold. NOTE: when len(x_train_total) is not divisible by
    # num_of_total_fold, the trailing remainder rows are never held out as a
    # test fold (same behaviour as the original implementation).
    fold_size = int(len(x_train_total) / num_of_total_fold)

    for k in tqdm(classifier_params['n_neighbors']):
        trainscores_folds = []
        testscores_folds = []
        for fold in range(num_of_total_fold):
            # Hold out the contiguous block [fold_size*fold, fold_size*(fold+1))
            # as the test fold; every other row index forms the train fold.
            # e.g. 3 folds over 100 rows -> test folds 0-32, 33-65, 66-98.
            test_indices = list(range(fold_size * fold, fold_size * (fold + 1)))
            train_indices = list(set(range(len(x_train_total))) - set(test_indices))

            x_train_fold = x_train_total[train_indices]
            y_train_fold = y_train_total[train_indices]
            x_test_fold = x_train_total[test_indices]
            y_test_fold = y_train_total[test_indices]

            # Set the candidate hyper-parameter, fit on the train fold, and
            # score both the held-out fold and the train fold.
            classifier.n_neighbors = k
            classifier.fit(x_train_fold, y_train_fold)
            testscores_folds.append(accuracy_score(y_test_fold, classifier.predict(x_test_fold)))
            trainscores_folds.append(accuracy_score(y_train_fold, classifier.predict(x_train_fold)))

        # Mean accuracy across folds for this candidate k.
        train_scores.append(np.mean(trainscores_folds))
        test_scores.append(np.mean(testscores_folds))

    return train_scores, test_scores, classifier_params
# Our classifier is KNN; the custom search mutates its n_neighbors attribute.
neigh = KNeighborsClassifier()
params_range = 50
number_of_total_folds = 3
# Run the custom randomized search and unpack its results.
# BUGFIX: the function returns (train_scores, test_scores, classifier_params),
# but the original unpacked `testscores, trainscores, params`, silently
# swapping the train and test accuracy lists.
trainscores, testscores, params = randomized_search_cv_custom(X_train, y_train, neigh, params_range, number_of_total_folds)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment