rohan-paul/RandomizedSearchFromScratch.py

## RandomizedSearchFromScratch.py
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import  tqdm
import random
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


x, y = make_classification(n_samples=10000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=60)

X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)

def randomized_search_cv_custom(x_train_total, y_train_total, classifier, param_range, num_of_total_fold):
    # x_train_total: its numpy array of shape, (n,d)
    # y_train_total: its numpy array of shape, (n,) or (n,1)
    # classifier: its typically KNeighborsClassifier()
    # param_range: Integer representing how many hyper-parameters I am considering for each iteration
    # num_of_total_fold: an integer, represents number of num_of_total_fold we need to devide the data and test our model

    # generating hyper-parameter range
    # generate 10 unique values(uniform random distribution) in the given range - starting 1 to "param_range"
    # ex: if param_range = 50, we need to generate 10 random numbers in range 1 to 50
    ten_random_values_for_param_range = sorted(random.sample(range(1, param_range), 10))

    train_scores = []
    test_scores = []

    classifier_params = { 'n_neighbors': ten_random_values_for_param_range }
    # it will take classifier and set of values for hyper parameters in dict type
    # dict({hyper parmeter: [list of values]})
    # as we are implementing this only for KNN, the hyper parameter should be n_neighbors
    # And I will return it from the function at the end so that I can use this same hyper-param
    # while plotting the graph as well

    for k in tqdm(classifier_params['n_neighbors']):
        trainscores_folds = []
        testscores_folds = []

        for fold in range(0, num_of_total_fold):
            # divide numbers ranging from  0 to len(x_train_fold) into groups = num_of_total_fold
            # basically, splitting the data into k groups (k = len(x_train_fold) / num_of_total_fold)
            # It works by first training the algorithm on the k_1 group of the data and
            # evaluating it on the kth hold-out group as the test set. This is repeated
            # so that each of the k groups is given an opportunity to be held out and used as the test set.
            # ex: num_of_total_fold=3, and len(x_train_total)=100, we can divide numbers from 0 to 100 into 3 groups
            # group 1: 0-33, group 2:34-66, group 3: 67-100
            num_of_elements_in_each_fold = int(len(x_train_total) / num_of_total_fold)

            # for each hyperparameter that we generated in step 1:
            # and using the above groups we have created in step 2 you will do cross-validation as follows

            # first we will keep group 1+group 2 i.e. 0-66 as train data and
            # group 3: 67-100 as test data, and find train and test accuracies
            # second we will keep group 1+group 3 i.e. 0-33, 67-100 as train data and
            # group 2: 34-66 as test data, and find train and test accuracies
            # third we will keep group 2+group 3 i.e. 34-100 as train data and
            # group 1: 0-33 as test data, and find train and test accuracies
            # based on the 'num_of_total_fold' value we will do the same procedure

            # NOW IMPLEMENTATION OF THE CONCEPT OF 'fold' as below
            # For each of this inner loop running for values of fold (where fold represents num_of_total_fold of 0, 1, 2, 3 ...)
            # each of the test_indices will have the data of a single fold ( which is = num_of_elements_in_each_fold )
            # i.e. the test_indices will be the range starting at
            # num_of_elements_in_each_fold * fold and ending at num_of_elements_in_each_fold * (fold + 1)
            # And this whole range needs to be converted to list => then apply set() to the list
            # => and then again converted to list
            test_indices = list(set(list(range((num_of_elements_in_each_fold * fold), (num_of_elements_in_each_fold*(fold+1))))))
            # print('test_indices ', test_indices)

            # And the rest of the indices of the dataset will be the train_indices
            train_indices = list(set(list(range(0, len(x_train_total)))) - set(test_indices) )
            # print('train_indices ', train_indices)

            ''' So for a dataset of total 100 rows, one loop of fold, will have the following test_indices and train_indices

            test_indices  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]

            train_indices  [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]

            And the next loop of j will have as below (for the same dataset of total 100 rows )

            test_indices  [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]

            train_indices  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]
            '''

            # after we have above, now select datapoints based on test_indices and train_indices
            x_train_fold = x_train_total[train_indices]
            y_train_fold = y_train_total[train_indices]
            x_test_fold = x_train_total[test_indices]
            y_test_fold = y_train_total[test_indices]

            # Now based on our classifier assign corresponding parameter values
            # and also fit() and predict()
            classifier.n_neighbors = k
            classifier.fit(x_train_fold, y_train_fold)

            # First predict based on x_test_fold and keep the accuracy score in the testscores_folds
            y_predicted = classifier.predict(x_test_fold)
            testscores_folds.append(accuracy_score(y_test_fold, y_predicted))

            # Now run prediction based on x_train_fold and append the accuracy score in the trainscores_folds
            y_predicted = classifier.predict(x_train_fold)
            trainscores_folds.append(accuracy_score(y_train_fold, y_predicted))

        train_scores.append(np.mean(np.array(trainscores_folds)))
        test_scores.append(np.mean(np.array(testscores_folds)))

    return train_scores, test_scores, classifier_params


 # Our Classifier is KNN, hence assign a variable to it.
neigh = KNeighborsClassifier()
params_range = 50
number_of_total_folds = 3

# Now invoking our custom function randomized_search_cv_custom(x_train,y_train,classifier, param_range, num_of_total_fold) and store the returned values
testscores, trainscores, params = randomized_search_cv_custom(X_train, y_train, neigh, params_range, number_of_total_folds)
	from sklearn.datasets import make_classification
	from sklearn.model_selection import train_test_split
	import numpy as np
	from tqdm import tqdm
	import random
	from sklearn.metrics import accuracy_score
	from sklearn.neighbors import KNeighborsClassifier
	from matplotlib.colors import ListedColormap
	import matplotlib.pyplot as plt
	import warnings
	warnings.filterwarnings("ignore")


	x, y = make_classification(n_samples=10000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=60)

	X_train, X_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)

	def randomized_search_cv_custom(x_train_total, y_train_total, classifier, param_range, num_of_total_fold):
	# x_train_total: its numpy array of shape, (n,d)
	# y_train_total: its numpy array of shape, (n,) or (n,1)
	# classifier: its typically KNeighborsClassifier()
	# param_range: Integer representing how many hyper-parameters I am considering for each iteration
	# num_of_total_fold: an integer, represents number of num_of_total_fold we need to devide the data and test our model

	# generating hyper-parameter range
	# generate 10 unique values(uniform random distribution) in the given range - starting 1 to "param_range"
	# ex: if param_range = 50, we need to generate 10 random numbers in range 1 to 50
	ten_random_values_for_param_range = sorted(random.sample(range(1, param_range), 10))

	train_scores = []
	test_scores = []

	classifier_params = { 'n_neighbors': ten_random_values_for_param_range }
	# it will take classifier and set of values for hyper parameters in dict type
	# dict({hyper parmeter: [list of values]})
	# as we are implementing this only for KNN, the hyper parameter should be n_neighbors
	# And I will return it from the function at the end so that I can use this same hyper-param
	# while plotting the graph as well

	for k in tqdm(classifier_params['n_neighbors']):
	trainscores_folds = []
	testscores_folds = []

	for fold in range(0, num_of_total_fold):
	# divide numbers ranging from 0 to len(x_train_fold) into groups = num_of_total_fold
	# basically, splitting the data into k groups (k = len(x_train_fold) / num_of_total_fold)
	# It works by first training the algorithm on the k_1 group of the data and
	# evaluating it on the kth hold-out group as the test set. This is repeated
	# so that each of the k groups is given an opportunity to be held out and used as the test set.
	# ex: num_of_total_fold=3, and len(x_train_total)=100, we can divide numbers from 0 to 100 into 3 groups
	# group 1: 0-33, group 2:34-66, group 3: 67-100
	num_of_elements_in_each_fold = int(len(x_train_total) / num_of_total_fold)

	# for each hyperparameter that we generated in step 1:
	# and using the above groups we have created in step 2 you will do cross-validation as follows

	# first we will keep group 1+group 2 i.e. 0-66 as train data and
	# group 3: 67-100 as test data, and find train and test accuracies
	# second we will keep group 1+group 3 i.e. 0-33, 67-100 as train data and
	# group 2: 34-66 as test data, and find train and test accuracies
	# third we will keep group 2+group 3 i.e. 34-100 as train data and
	# group 1: 0-33 as test data, and find train and test accuracies
	# based on the 'num_of_total_fold' value we will do the same procedure

	# NOW IMPLEMENTATION OF THE CONCEPT OF 'fold' as below
	# For each of this inner loop running for values of fold (where fold represents num_of_total_fold of 0, 1, 2, 3 ...)
	# each of the test_indices will have the data of a single fold ( which is = num_of_elements_in_each_fold )
	# i.e. the test_indices will be the range starting at
	# num_of_elements_in_each_fold * fold and ending at num_of_elements_in_each_fold * (fold + 1)
	# And this whole range needs to be converted to list => then apply set() to the list
	# => and then again converted to list
	test_indices = list(set(list(range((num_of_elements_in_each_fold * fold), (num_of_elements_in_each_fold*(fold+1))))))
	# print('test_indices ', test_indices)

	# And the rest of the indices of the dataset will be the train_indices
	train_indices = list(set(list(range(0, len(x_train_total)))) - set(test_indices) )
	# print('train_indices ', train_indices)

	''' So for a dataset of total 100 rows, one loop of fold, will have the following test_indices and train_indices

	test_indices [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]

	train_indices [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]

	And the next loop of j will have as below (for the same dataset of total 100 rows )

	test_indices [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]

	train_indices [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]
	'''

	# after we have above, now select datapoints based on test_indices and train_indices
	x_train_fold = x_train_total[train_indices]
	y_train_fold = y_train_total[train_indices]
	x_test_fold = x_train_total[test_indices]
	y_test_fold = y_train_total[test_indices]

	# Now based on our classifier assign corresponding parameter values
	# and also fit() and predict()
	classifier.n_neighbors = k
	classifier.fit(x_train_fold, y_train_fold)

	# First predict based on x_test_fold and keep the accuracy score in the testscores_folds
	y_predicted = classifier.predict(x_test_fold)
	testscores_folds.append(accuracy_score(y_test_fold, y_predicted))

	# Now run prediction based on x_train_fold and append the accuracy score in the trainscores_folds
	y_predicted = classifier.predict(x_train_fold)
	trainscores_folds.append(accuracy_score(y_train_fold, y_predicted))

	train_scores.append(np.mean(np.array(trainscores_folds)))
	test_scores.append(np.mean(np.array(testscores_folds)))

	return train_scores, test_scores, classifier_params


	# Our Classifier is KNN, hence assign a variable to it.
	neigh = KNeighborsClassifier()
	params_range = 50
	number_of_total_folds = 3

	# Now invoking our custom function randomized_search_cv_custom(x_train,y_train,classifier, param_range, num_of_total_fold) and store the returned values
	testscores, trainscores, params = randomized_search_cv_custom(X_train, y_train, neigh, params_range, number_of_total_folds)