Created
March 12, 2020 13:03
-
-
Save mateuszbaran/89307c254074306e93504a138b43990e to your computer and use it in GitHub Desktop.
Laboratorium 1 rozwiązania
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "zajecia1_rozw.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"toc_visible": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "rnVj7UCQY2ap", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Wstęp\n", | |
"\n", | |
"Wykonaj i przeanalizuj poniższy skrypt" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "rZN3BAlJTLnk", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from sklearn import datasets\n", | |
"import matplotlib.pyplot as plt\n", | |
"import random\n", | |
"import numpy as np\n", | |
"\n", | |
"# Load the three bundled scikit-learn example datasets.\n", | |
"iris, digits, diabetes = (\n", | |
"    datasets.load_iris(),\n", | |
"    datasets.load_digits(),\n", | |
"    datasets.load_diabetes(),\n", | |
")\n", | |
"\n", | |
"# Each data matrix has shape (number of samples, number of features).\n", | |
"for label, bunch in ((\"iris: \", iris), (\"digits: \", digits), (\"diabetes: \", diabetes)):\n", | |
"    print(label, bunch.data.shape)\n", | |
"\n", | |
"print(iris.target)\n", | |
"\n", | |
"digit_idx = 43\n", | |
"sample_label = digits.target[digit_idx]\n", | |
"print(\"Etykieta cyfry: \", sample_label)\n", | |
"\n", | |
"# Display the selected example image.\n", | |
"sample_image = digits.images[digit_idx]\n", | |
"plt.figure(1, figsize=(3, 3))\n", | |
"plt.imshow(sample_image, cmap=plt.cm.gray_r, interpolation='nearest')\n", | |
"plt.show()\n", | |
"\n", | |
"def disturb_data(X, sigma):\n", | |
"    \"\"\"Return ``X`` with standard-normal noise, scaled by ``sigma``, added to every entry.\n", | |
"\n", | |
"    The noise has the same shape as ``X`` and is drawn from NumPy's global\n", | |
"    random state, so seed it beforehand if reproducible output is needed.\n", | |
"    \"\"\"\n", | |
"    noise = np.random.randn(*X.shape)\n", | |
"    return X + sigma * noise\n", | |
"\n", | |
"# Seed NumPy's global generator so the noisy data set (and every result derived from it) is reproducible.\n", | |
"np.random.seed(42)\n", | |
"plt.figure(1, figsize=(3, 3))\n", | |
"# Keep only feature columns 1 and 3 and perturb them with Gaussian noise (sigma = 0.3).\n", | |
"iris_reduced_data = disturb_data(iris.data[:, [1,3]], 0.3)\n", | |
"# One scatter series per class, selected by label so that no sample is skipped\n", | |
"# (hard-coded ranges such as range(0, 49) leave out the last sample of each class).\n", | |
"for cls in np.unique(iris.target):\n", | |
"    cls_mask = iris.target == cls\n", | |
"    plt.scatter(iris_reduced_data[cls_mask, 0], iris_reduced_data[cls_mask, 1])\n", | |
"plt.show()\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "I7F5jc0_X6-O", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Zadanie 1\n", | |
"\n", | |
"Policz ile jest próbek w poszczególnych klasach" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Xd6rlS8hWlHG", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"dataset = digits\n", | |
"# Tally how many samples carry each class label.\n", | |
"counts = {}\n", | |
"for cls in dataset.target:\n", | |
"    counts[cls] = counts.get(cls, 0) + 1\n", | |
"\n", | |
"print(counts)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Zclzut-aaT-N", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Zadanie 2\n", | |
"\n", | |
"Podziel zbiór danych `iris_reduced_data` na dwie części: część treningowa (po 40 osobników z każdej klasy) i testowa (po 10 osobników każdej klasy).\n", | |
"Naucz klasyfikator $k$NN ($k$ Nearest Neighbors) klasyfikować dane z zestawu `iris_reduced_data` (funkcja `fit` korzystająca z części treningowej).\n", | |
"Policz (korzystając z części testowej):\n", | |
"\n", | |
"* Dokładność (accuracy)\n", | |
"* Czułość (sensitivity, true positive rate, TPR)\n", | |
"* Swoistość (specificity, true negative rate, TNR)\n", | |
"\n", | |
"Jak zmienią się te wskaźniki gdy weźmiemy tylko po 30 osobników z każdej klasy do zbioru treningowego?\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "auAzymE1nH7x", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from matplotlib.colors import ListedColormap\n", | |
"import numpy as np\n", | |
"from sklearn import metrics\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"from sklearn.neighbors import KNeighborsClassifier\n", | |
"\n", | |
"\n", | |
"neigh = KNeighborsClassifier(n_neighbors=3)\n", | |
"\n", | |
"# Stratified 80/20 split of the 150 samples: 40 training and 10 test samples per class,\n", | |
"# as the task statement requires (an unstratified 50/50 split does not guarantee this).\n", | |
"# For the 30-per-class variant pass train_size=0.6 while keeping test_size=0.2.\n", | |
"X_train, X_test, y_train, y_test = train_test_split(\n", | |
"    iris_reduced_data, iris.target, test_size=0.2, stratify=iris.target, random_state=42)\n", | |
"neigh.fit(X_train, y_train)\n", | |
"\n", | |
"clf = neigh\n", | |
"X = iris_reduced_data\n", | |
"h = .01 # step size in the mesh\n", | |
"# Pad the bounding box of the data by 0.5 on every side.\n", | |
"x_min = X[:, 0].min() - .5\n", | |
"x_max = X[:, 0].max() + .5\n", | |
"y_min = X[:, 1].min() - .5\n", | |
"y_max = X[:, 1].max() + .5\n", | |
"x_ticks = np.arange(x_min, x_max, h)\n", | |
"y_ticks = np.arange(y_min, y_max, h)\n", | |
"xx, yy = np.meshgrid(x_ticks, y_ticks)\n", | |
"\n", | |
"# Evaluate the predicted probability of one class at every point of the mesh\n", | |
"# [x_min, x_max]x[y_min, y_max] and shape it back into the grid for the colour plot.\n", | |
"class_number = 0\n", | |
"mesh_points = np.c_[xx.ravel(), yy.ravel()]\n", | |
"Z = clf.predict_proba(mesh_points)[:, class_number].reshape(xx.shape)\n", | |
"\n", | |
"ax = plt.subplot()\n", | |
"cm = plt.cm.RdBu\n", | |
"cm_bright = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n", | |
"ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)\n", | |
"\n", | |
"# Overlay the samples coloured by true class: training points faint, test points nearly opaque.\n", | |
"for points, labels, opacity in ((X_train, y_train, 0.4), (X_test, y_test, 0.9)):\n", | |
"    ax.scatter(points[:, 0], points[:, 1], c=labels, cmap=cm_bright,\n", | |
"               edgecolors='k', alpha=opacity)\n", | |
"\n", | |
"ax.set_xlim(xx.min(), xx.max())\n", | |
"ax.set_ylim(yy.min(), yy.max())\n", | |
"\n", | |
"y_pred = neigh.predict(X_test)\n", | |
"# A ROC curve needs continuous scores, not hard labels: use the predicted\n", | |
"# probability of the positive class (one-vs-rest for class 2).\n", | |
"pos_label = 2\n", | |
"pos_column = list(neigh.classes_).index(pos_label)\n", | |
"y_score = neigh.predict_proba(X_test)[:, pos_column]\n", | |
"fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, pos_label=pos_label)\n", | |
"roc_auc = metrics.auc(fpr, tpr)\n", | |
"print(\"ACC = \", metrics.accuracy_score(y_test, y_pred))\n", | |
"print(\"AUC = \", roc_auc)\n", | |
"print(metrics.classification_report(y_test, y_pred))\n", | |
"print(metrics.multilabel_confusion_matrix(y_test, y_pred))\n", | |
"# metrics.plot_confusion_matrix was removed in scikit-learn 1.2; building the display\n", | |
"# from an explicit confusion matrix works on both old and new releases.\n", | |
"conf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=neigh.classes_)\n", | |
"metrics.ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=neigh.classes_).plot()\n", | |
"\n", | |
"roc_plot = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=\"kNN\")\n", | |
"roc_plot.plot()\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "RBAXhy35SkIG", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Zadanie 3\n", | |
"\n", | |
"Powtórz zadanie 2, ale dla klasyfikatora SVM. Użyj kerneli `rbf` i `linear`. Porównaj działanie dla wartości parametru `C=1.0, 100.0, 0.01`. Dla kernela `rbf` przetestuj różne opcje skalowania cech (parametr `gamma`: wartości `scale`, `auto`, `1.0`, `10.0`, `0.1`)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "mqFMWzV0Sw4C", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from sklearn.svm import SVC\n", | |
"clf = SVC(gamma='auto')\n", | |
"for C in [1.0, 100.0, 0.01]:\n", | |
"    # Linear kernel: gamma has no effect on this kernel, only C matters.\n", | |
"    clf.set_params(C=C, kernel='linear')\n", | |
"    clf.fit(X_train, y_train)\n", | |
"    y_pred = clf.predict(X_test)\n", | |
"    print(\"ACC [linear, C=\", C, \"] = \", metrics.accuracy_score(y_test, y_pred))\n", | |
"    # RBF kernel: sweep the gamma setting for the same C.\n", | |
"    for gamma in ['scale', 'auto', 1.0, 10.0, 0.1]:\n", | |
"        clf.set_params(C=C, kernel='rbf', gamma=gamma)\n", | |
"        clf.fit(X_train, y_train)\n", | |
"        # Predict with the freshly fitted RBF model; reusing the linear model's\n", | |
"        # y_pred would print the linear accuracy for every gamma.\n", | |
"        y_pred = clf.predict(X_test)\n", | |
"        print(\"ACC [rbf, C=\", C, \", gamma=\", gamma, \"] = \", metrics.accuracy_score(y_test, y_pred))\n", | |
"\n", | |
"\n", | |
"\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "7jT_2gw04SVM", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Zadanie 4\n", | |
"\n", | |
"Znajdź najlepsze (pod względem dokładności) parametry klasyfikatora SVM z użyciem 5-krotnej walidacji krzyżowej: kernel, $C$, wybrany parametr kernela. Przeszukaj przynajmniej 100 zestawów wartości.\n", | |
"\n", | |
"Czy te same parametry zapewniają najlepszą wartość innych metryk?\n", | |
"\n", | |
"Wykorzystaj `from sklearn.model_selection import KFold`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "J4aqbEtE7UAL", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from sklearn.model_selection import KFold\n", | |
"from sklearn.svm import SVC\n", | |
"\n", | |
"def evaluate_classifier(C, kernel='rbf', gamma='auto'):\n", | |
"    \"\"\"Cross-validate an SVC on ``iris_reduced_data`` with 5 shuffled folds.\n", | |
"\n", | |
"    Prints the accuracy of every fold (as before) and additionally returns\n", | |
"    the per-fold accuracies so callers can aggregate or compare them.\n", | |
"\n", | |
"    Parameters:\n", | |
"        C: regularisation parameter passed to ``SVC``.\n", | |
"        kernel: kernel name passed to ``SVC`` (default ``'rbf'``).\n", | |
"        gamma: kernel coefficient passed to ``SVC`` (default ``'auto'``).\n", | |
"\n", | |
"    Returns:\n", | |
"        list of accuracy scores, one per fold.\n", | |
"    \"\"\"\n", | |
"    # The iris samples are ordered by class, so unshuffled folds would test on\n", | |
"    # samples from (almost) a single class; shuffle with a fixed seed instead.\n", | |
"    kf = KFold(n_splits=5, shuffle=True, random_state=42)\n", | |
"    X = iris_reduced_data\n", | |
"    y = iris.target\n", | |
"    fold_accuracies = []\n", | |
"    for train_index, test_index in kf.split(X, y):\n", | |
"        X_train, X_test = X[train_index], X[test_index]\n", | |
"        y_train, y_test = y[train_index], y[test_index]\n", | |
"\n", | |
"        clf = SVC(gamma=gamma, kernel=kernel, C=C)\n", | |
"        clf.fit(X_train, y_train)\n", | |
"        y_pred = clf.predict(X_test)\n", | |
"        accuracy = metrics.accuracy_score(y_test, y_pred)\n", | |
"        print(accuracy)\n", | |
"        fold_accuracies.append(accuracy)\n", | |
"    return fold_accuracies\n", | |
"\n", | |
"\n", | |
"# Report the per-fold accuracies for a few values of the SVC parameter C.\n", | |
"candidate_Cs = (1.0, 3.0, 0.3)\n", | |
"for C in candidate_Cs:\n", | |
"    print(f\"C = {C}\")\n", | |
"    evaluate_classifier(C)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment