sallos-cyber/anwers_to_programming_questions_markelic.ipynb

## anwers_to_programming_questions_markelic.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f2416d54",
   "metadata": {},
   "source": [
    "Solution to the following programming question:\n",
    "\n",
    "Using sklearn.model_selection.permutation_test_score, compute the p-value indicating if the score obtained by an instance of sklearn.dummy.DummyClassifier on the dataset sklearn.datasets.load_iris is obtained by chance.\n",
    "\n",
    "Repeat this analysis using sklearn.ensemble.HistGradientBoostingClassifier instead of sklearn.dummy.DummyClassifier.\n",
    "\n",
    "What can you conclude about:\n",
    "\n",
    "- the existence of a significant statistical association between the iris type and the input features (petal and sepal width and length)?\n",
    "- the ability of each kind of estimator to assess or not such a statistical association between features and target variable?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "408bbad8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import HistGradientBoostingClassifier\n",
    "import numpy as np\n",
    "from sklearn import datasets\n",
    "import scipy.stats as stats\n",
    "from sklearn.dummy import DummyClassifier\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d87e0650",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_obtained_by_chance(nr_classes: int, nr_samples: int, score: float,\n",
    "                          alpha: float = .05) -> bool:\n",
    "    \"\"\"Check whether a score is statistically indistinguishable from random guessing.\n",
    "\n",
    "    With uniform random guessing every sample is classified correctly with\n",
    "    probability p0 = 1/nr_classes, so the expected proportion of correctly\n",
    "    classified samples equals p0. The following two-sided test is conducted:\n",
    "\n",
    "    Hypothesis 0: proportion = 1/nr_classes\n",
    "    Hypothesis A: proportion != 1/nr_classes\n",
    "\n",
    "    By the central limit theorem the sample proportion is approximately\n",
    "    normally distributed for a large number of samples. Under H0 its mean is\n",
    "    p0 and its standard error is sqrt(p0 * (1 - p0) / nr_samples). The\n",
    "    reference distribution used here is a t-distribution with\n",
    "    nr_samples - 1 degrees of freedom, centred at p0 and scaled by that\n",
    "    standard error; H0 is rejected when the two-sided p-value of the\n",
    "    observed score falls below alpha.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    nr_classes : int\n",
    "        number of possible output classes for the given classification task,\n",
    "        must be at least 2\n",
    "    nr_samples : int\n",
    "        number of samples that the score was obtained from, must be at least 2\n",
    "    score : float\n",
    "        proportion of correctly classified samples, in [0, 1]\n",
    "    alpha : float, default = 0.05\n",
    "        desired significance level for the test, i.e. 1 - confidence level\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    obtained_by_chance : bool\n",
    "        True if H0 was not rejected, i.e. the score is comparable to random\n",
    "        guessing; False if the score is significantly better or worse\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        if any argument lies outside the range documented above\n",
    "    \"\"\"\n",
    "    # Degenerate inputs would make the standard error or the degrees of\n",
    "    # freedom zero and turn every comparison below into a NaN comparison.\n",
    "    if nr_classes < 2:\n",
    "        raise ValueError('nr_classes must be at least 2')\n",
    "    if nr_samples < 2:\n",
    "        raise ValueError('nr_samples must be at least 2')\n",
    "    if not 0 <= score <= 1:\n",
    "        raise ValueError('score must be a proportion in [0, 1]')\n",
    "    if not 0 < alpha < 1:\n",
    "        raise ValueError('alpha must lie strictly between 0 and 1')\n",
    "\n",
    "    p0 = 1 / nr_classes\n",
    "    sem = np.sqrt(p0 * (1 - p0) / nr_samples)\n",
    "    null_distribution = stats.t(df=nr_samples - 1, loc=p0, scale=sem)\n",
    "\n",
    "    # The lower tail is small when the score is far BELOW chance, the upper\n",
    "    # tail is small when the score is far ABOVE chance.\n",
    "    lower_tail = null_distribution.cdf(score)\n",
    "    upper_tail = null_distribution.sf(score)\n",
    "    p_value = 2 * min(lower_tail, upper_tail)\n",
    "\n",
    "    # H0 (random guessing) is kept unless the two-sided p-value is below alpha\n",
    "    return bool(p_value >= alpha)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e34b21d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data, fit each model on the training split and score it on the\n",
    "# held-out test split.\n",
    "# NOTE(review): the task statement asks for\n",
    "# sklearn.model_selection.permutation_test_score; this cell instead applies\n",
    "# is_obtained_by_chance to the accuracy of a single train/test split.\n",
    "X, y = datasets.load_iris(return_X_y=True)\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y,\n",
    "    test_size=0.4,\n",
    "    random_state=0)\n",
    "# derive the number of classes from the labels instead of hard-coding it\n",
    "nr_classes = len(np.unique(y))\n",
    "\n",
    "dummy_clf = DummyClassifier()\n",
    "dummy_clf.fit(X_train, y_train)\n",
    "dummy_score = dummy_clf.score(X_test, y_test)\n",
    "print('The assumption that the DummyClassifier performance is not different from random guessing is',\n",
    "      is_obtained_by_chance(nr_classes, len(y_test), dummy_score))\n",
    "\n",
    "# now repeat with a different classifier\n",
    "model = HistGradientBoostingClassifier(max_bins=255, max_iter=100)\n",
    "model.fit(X_train, y_train)\n",
    "boosting_score = model.score(X_test, y_test)\n",
    "print('The assumption that the HistGradientBoostingClassifier performance is not different from random guessing is',\n",
    "      is_obtained_by_chance(nr_classes, len(y_test), boosting_score))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "f2416d54",
	"metadata": {},
	"source": [
	"Solution to the following programming question:\n",
	"\n",
	"Using sklearn.model_selection.permutation_test_score, compute the p-value indicating if the score obtained by an instance of sklearn.dummy.DummyClassifier on the dataset sklearn.datasets.load_iris is obtained by chance.\n",
	"\n",
	"Repeat this analysis using sklearn.ensemble.HistGradientBoostingClassifier instead of sklearn.dummy.DummyClassifier.\n",
	"\n",
	"What can you conclude about:\n",
	"\n",
	"- the existence of a significant statistical association between the iris type and the input features (petal and sepal width and length)?\n",
	"- the ability of each kind of estimator to assess or not such a statistical association between features and target variable?\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "408bbad8",
	"metadata": {},
	"outputs": [],
	"source": [
	"from sklearn.ensemble import HistGradientBoostingClassifier\n",
	"import numpy as np\n",
	"from sklearn import datasets\n",
	"import scipy.stats as stats\n",
	"from sklearn.dummy import DummyClassifier\n",
	"from sklearn.model_selection import train_test_split"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "d87e0650",
	"metadata": {},
	"outputs": [],
	"source": [
	"def is_obtained_by_chance(nr_classes: int, nr_samples: int, score: float,\n",
	"                          alpha: float = .05) -> bool:\n",
	"    \"\"\"Check whether a score is statistically indistinguishable from random guessing.\n",
	"\n",
	"    With uniform random guessing every sample is classified correctly with\n",
	"    probability p0 = 1/nr_classes, so the expected proportion of correctly\n",
	"    classified samples equals p0. The following two-sided test is conducted:\n",
	"\n",
	"    Hypothesis 0: proportion = 1/nr_classes\n",
	"    Hypothesis A: proportion != 1/nr_classes\n",
	"\n",
	"    By the central limit theorem the sample proportion is approximately\n",
	"    normally distributed for a large number of samples. Under H0 its mean is\n",
	"    p0 and its standard error is sqrt(p0 * (1 - p0) / nr_samples). The\n",
	"    reference distribution used here is a t-distribution with\n",
	"    nr_samples - 1 degrees of freedom, centred at p0 and scaled by that\n",
	"    standard error; H0 is rejected when the two-sided p-value of the\n",
	"    observed score falls below alpha.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    nr_classes : int\n",
	"        number of possible output classes for the given classification task,\n",
	"        must be at least 2\n",
	"    nr_samples : int\n",
	"        number of samples that the score was obtained from, must be at least 2\n",
	"    score : float\n",
	"        proportion of correctly classified samples, in [0, 1]\n",
	"    alpha : float, default = 0.05\n",
	"        desired significance level for the test, i.e. 1 - confidence level\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    obtained_by_chance : bool\n",
	"        True if H0 was not rejected, i.e. the score is comparable to random\n",
	"        guessing; False if the score is significantly better or worse\n",
	"\n",
	"    Raises\n",
	"    ------\n",
	"    ValueError\n",
	"        if any argument lies outside the range documented above\n",
	"    \"\"\"\n",
	"    # Degenerate inputs would make the standard error or the degrees of\n",
	"    # freedom zero and turn every comparison below into a NaN comparison.\n",
	"    if nr_classes < 2:\n",
	"        raise ValueError('nr_classes must be at least 2')\n",
	"    if nr_samples < 2:\n",
	"        raise ValueError('nr_samples must be at least 2')\n",
	"    if not 0 <= score <= 1:\n",
	"        raise ValueError('score must be a proportion in [0, 1]')\n",
	"    if not 0 < alpha < 1:\n",
	"        raise ValueError('alpha must lie strictly between 0 and 1')\n",
	"\n",
	"    p0 = 1 / nr_classes\n",
	"    sem = np.sqrt(p0 * (1 - p0) / nr_samples)\n",
	"    null_distribution = stats.t(df=nr_samples - 1, loc=p0, scale=sem)\n",
	"\n",
	"    # The lower tail is small when the score is far BELOW chance, the upper\n",
	"    # tail is small when the score is far ABOVE chance.\n",
	"    lower_tail = null_distribution.cdf(score)\n",
	"    upper_tail = null_distribution.sf(score)\n",
	"    p_value = 2 * min(lower_tail, upper_tail)\n",
	"\n",
	"    # H0 (random guessing) is kept unless the two-sided p-value is below alpha\n",
	"    return bool(p_value >= alpha)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "e34b21d6",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Load the data, fit each model on the training split and score it on the\n",
	"# held-out test split.\n",
	"# NOTE(review): the task statement asks for\n",
	"# sklearn.model_selection.permutation_test_score; this cell instead applies\n",
	"# is_obtained_by_chance to the accuracy of a single train/test split.\n",
	"X, y = datasets.load_iris(return_X_y=True)\n",
	"X_train, X_test, y_train, y_test = train_test_split(\n",
	"    X, y,\n",
	"    test_size=0.4,\n",
	"    random_state=0)\n",
	"# derive the number of classes from the labels instead of hard-coding it\n",
	"nr_classes = len(np.unique(y))\n",
	"\n",
	"dummy_clf = DummyClassifier()\n",
	"dummy_clf.fit(X_train, y_train)\n",
	"dummy_score = dummy_clf.score(X_test, y_test)\n",
	"print('The assumption that the DummyClassifier performance is not different from random guessing is',\n",
	"      is_obtained_by_chance(nr_classes, len(y_test), dummy_score))\n",
	"\n",
	"# now repeat with a different classifier\n",
	"model = HistGradientBoostingClassifier(max_bins=255, max_iter=100)\n",
	"model.fit(X_train, y_train)\n",
	"boosting_score = model.score(X_test, y_test)\n",
	"print('The assumption that the HistGradientBoostingClassifier performance is not different from random guessing is',\n",
	"      is_obtained_by_chance(nr_classes, len(y_test), boosting_score))\n"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.11.4"
	},
	"varInspector": {
	"cols": {
	"lenName": 16,
	"lenType": 16,
	"lenVar": 40
	},
	"kernels_config": {
	"python": {
	"delete_cmd_postfix": "",
	"delete_cmd_prefix": "del ",
	"library": "var_list.py",
	"varRefreshCmd": "print(var_dic_list())"
	},
	"r": {
	"delete_cmd_postfix": ") ",
	"delete_cmd_prefix": "rm(",
	"library": "var_list.r",
	"varRefreshCmd": "cat(var_dic_list()) "
	}
	},
	"types_to_exclude": [
	"module",
	"function",
	"builtin_function_or_method",
	"instance",
	"_Feature"
	],
	"window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}