Created
March 6, 2024 16:32
-
-
Save sallos-cyber/ae2d66ff7d2fe5431531b08c79a5149f to your computer and use it in GitHub Desktop.
Answers to programming questions "probabl"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "f2416d54", | |
"metadata": {}, | |
"source": [ | |
"Solution to the following programming question:\n", | |
"\n", | |
"Using sklearn.model_selection.permutation_test_score, compute the p-value indicating if the score obtained by an instance of sklearn.dummy.DummyClassifier on the dataset sklearn.datasets.load_iris is obtained by chance.\n", | |
"\n", | |
"Repeat this analysis using sklearn.ensemble.HistGradientBoostingClassifier instead of sklearn.dummy.DummyClassifier.\n", | |
"\n", | |
"What can you conclude about:\n", | |
"\n", | |
"- the existence of a significant statistical association between the iris type and the input features (petal and sepal width and length)?\n", | |
"- the ability of each kind of estimator to assess whether such a statistical association between features and target variable exists?\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "408bbad8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.ensemble import HistGradientBoostingClassifier\n", | |
"import numpy as np\n", | |
"from sklearn import datasets\n", | |
"import scipy.stats as stats\n", | |
"from sklearn.dummy import DummyClassifier\n", | |
"from sklearn.model_selection import train_test_split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "d87e0650", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def is_obtained_by_chance(nr_classes: int, nr_samples: int, score: float,\n", | |
" alpha: float = .05) -> bool:\n", | |
" \"\"\"Check if the submitted score is just as good or bad as random guessing.\n", | |
"\n", | |
" In case of random guessing the probability of obtaining the correct output\n", | |
" per sample equals 1/nr_classes, thus the proportion of correctly classified\n", | |
" samples (p0) would equal 1/nr_classes.\n", | |
" Because of the central limit theorem we know that for a large number of\n", | |
" samples the sampling distribution of the sample proportion follows a\n", | |
" normal distribution. However, since the variance must be estimated,\n", | |
" we assume a t-distribution.\n", | |
" Therefore the following hypothesis test (t-test) is conducted:\n", | |
"\n", | |
" Hypothesis 0: proportion = 1/nr_classes\n", | |
" Hypothesis A: proportion != 1/nr_classes\n", | |
"\n", | |
" Thus, under the assumption that the Null hypothesis is correct,\n", | |
" the test statistic follows a t-distribution with mean p0, with scale\n", | |
" equal to the standard error of the sample proportion (sem),\n", | |
" sqrt(p0 * (1 - p0) / nr_samples), and with degrees of freedom equal to\n", | |
" nr_samples - 1.\n", | |
"\n", | |
" Parameters\n", | |
" ----------\n", | |
" nr_classes : int\n", | |
" number of possible output classes for the given classification task\n", | |
" nr_samples : int\n", | |
" number of samples that the score was obtained from\n", | |
" score : float\n", | |
" proportion of correctly classified samples\n", | |
" alpha : float, default = 0.05\n", | |
" desired significance level for the test, i.e. 1 - confidence level\n", | |
"\n", | |
" Returns\n", | |
" -------\n", | |
" obtained_by_chance : bool\n", | |
" True, if the model performance is comparable to random guessing, i.e. H0 was not rejected\n", | |
" else False\n", | |
" \"\"\"\n", | |
" obtained_by_chance = True\n", | |
"\n", | |
" p0 = 1/nr_classes\n", | |
" sem = np.sqrt(p0 * (1-p0) / nr_samples)\n", | |
" p_value = stats.t(df=nr_samples-1, loc=p0, scale=sem).cdf(score)\n", | |
" # score lies in the lower tail: model performs worse than chance -> reject H0\n", | |
" if p_value < alpha/2:\n", | |
" obtained_by_chance = False\n", | |
" return obtained_by_chance\n", | |
" # score lies in the upper tail: model performs better than chance -> reject H0\n", | |
" if (1-p_value) < alpha/2:\n", | |
" obtained_by_chance = False\n", | |
" return obtained_by_chance\n", | |
"\n", | |
" return obtained_by_chance" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "e34b21d6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# load the data, train model on training set and apply to test set\n", | |
"X, y = datasets.load_iris(return_X_y=True)\n", | |
"X_train, X_test, y_train, y_test = train_test_split(\n", | |
" X, y,\n", | |
" test_size=0.4,\n", | |
" random_state=0)\n", | |
"\n", | |
"dummy_clf = DummyClassifier()\n", | |
"dummy_clf.fit(X_train, y_train)\n", | |
"score = dummy_clf.score(X_test, y_test)\n", | |
"print('The assumption that the DummyClassifier performance is not different from random guessing is',\n", | |
" is_obtained_by_chance(3, len(y_test), score))\n", | |
"\n", | |
"# now repeat with a different classifier\n", | |
"model = HistGradientBoostingClassifier(max_bins=255, max_iter=100)\n", | |
"model.fit(X_train, y_train)\n", | |
"prediction_boosting = model.predict(X_test)\n", | |
"score = model.score(X_test, y_test)\n", | |
"print('The assumption that the HistGradientBoostingClassifier performance is not different from random guessing is',\n", | |
" is_obtained_by_chance(3, len(y_test), score))\n" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.4" | |
}, | |
"varInspector": { | |
"cols": { | |
"lenName": 16, | |
"lenType": 16, | |
"lenVar": 40 | |
}, | |
"kernels_config": { | |
"python": { | |
"delete_cmd_postfix": "", | |
"delete_cmd_prefix": "del ", | |
"library": "var_list.py", | |
"varRefreshCmd": "print(var_dic_list())" | |
}, | |
"r": { | |
"delete_cmd_postfix": ") ", | |
"delete_cmd_prefix": "rm(", | |
"library": "var_list.r", | |
"varRefreshCmd": "cat(var_dic_list()) " | |
} | |
}, | |
"types_to_exclude": [ | |
"module", | |
"function", | |
"builtin_function_or_method", | |
"instance", | |
"_Feature" | |
], | |
"window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment