Created
May 31, 2018 13:30
-
-
Save greenwolf-nsk/a95938bfcc392e4b9aec80a1583ab6d2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-30T13:37:12.709037Z", | |
"start_time": "2018-05-30T13:37:12.706202Z" | |
} | |
}, | |
"source": [ | |
"# Numpy & Pandas" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:01.944282Z", | |
"start_time": "2018-05-31T13:13:01.486856Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"1. Создайте numpy массив из целых чисел от 1 до 10\n", | |
"2. Создайте numpy массив из целых чисел от 10 до 1\n", | |
"3. Чему равно Евклидово расстояние между векторами?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:01.950726Z", | |
"start_time": "2018-05-31T13:13:01.946256Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"18.16590212458495\n" | |
] | |
} | |
], | |
"source": [ | |
"a = np.arange(1, 11)\n", | |
"b = np.arange(10, 0, -1)\n", | |
"print(np.sqrt(((a - b) ** 2).sum()))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"1. Создайте из массивов pandas DataFrame с двумя колонками\n", | |
"2. При помощи iloc достаньте из DataFrame первую строку. Какой тип объекта получим?\n", | |
"3. При помощи iloc достаньте первые 5 строк из DataFrame. Какой тип объекта получим?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:01.959191Z", | |
"start_time": "2018-05-31T13:13:01.953446Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.series.Series'>\n", | |
"<class 'pandas.core.frame.DataFrame'>\n" | |
] | |
} | |
], | |
"source": [ | |
"df = pd.DataFrame({'a' : a, 'b': b})\n", | |
"first_row = df.iloc[0]\n", | |
"first_rows = df.iloc[:5]\n", | |
"print(type(first_row))\n", | |
"print(type(first_rows))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Classification & Sklearn" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:02.425587Z", | |
"start_time": "2018-05-31T13:13:01.961424Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import sklearn\n", | |
"from sklearn.neighbors import KNeighborsClassifier\n", | |
"from sklearn.model_selection import cross_val_score" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T11:23:26.757656Z", | |
"start_time": "2018-05-31T11:23:26.753335Z" | |
} | |
}, | |
"source": [ | |
"Создадим игрушечные данные с возрастами 5 детей и 5 взрослых" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:02.431995Z", | |
"start_time": "2018-05-31T13:13:02.427871Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"age = [10] * 5 + [40] * 5\n", | |
"simple_train = pd.DataFrame({'age': age})\n", | |
"simple_target = [0] * 5 + [1] * 5" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T11:19:51.828238Z", | |
"start_time": "2018-05-31T11:19:51.824183Z" | |
} | |
}, | |
"source": [ | |
"Обучите классификатор с параметром n_neighbors=1 на игрушечных данных" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:02.446549Z", | |
"start_time": "2018-05-31T13:13:02.433717Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", | |
" metric_params=None, n_jobs=1, n_neighbors=1, p=2,\n", | |
" weights='uniform')" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"knn = KNeighborsClassifier(n_neighbors=1)\n", | |
"knn.fit(simple_train, simple_target)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T11:24:45.057494Z", | |
"start_time": "2018-05-31T11:24:45.052922Z" | |
} | |
}, | |
"source": [ | |
"Реализуйте функцию is_child(age: int), которая при помощи классификатора определяет по возрасту, является ли человек ребенком" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:02.451626Z", | |
"start_time": "2018-05-31T13:13:02.448809Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"def is_child(age: int) -> bool:\n", | |
" score = knn.predict([[age]])[0]\n", | |
" return not bool(score)\n", | |
" \n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:03.050649Z", | |
"start_time": "2018-05-31T13:13:03.045766Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"False" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"is_child(26)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Когда человек перестает быть ребенком (по мнению нашего классификатора)? :)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:03:25.264913Z", | |
"start_time": "2018-05-31T13:03:25.260439Z" | |
} | |
}, | |
"source": [ | |
"В 26" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Data Preparation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:04.120381Z", | |
"start_time": "2018-05-31T13:13:04.117254Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.model_selection import train_test_split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:04.360184Z", | |
"start_time": "2018-05-31T13:13:04.349658Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"data = pd.read_csv('titanic_train.csv')\n", | |
"target = data['Survived']\n", | |
"del data['Survived']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"При помощи функции train_test_split разбейте данные на train и test в соотношении 2 к 1\n", | |
"У вас должно получится 4 массива: X_train, X_test, y_train, y_test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:05.046042Z", | |
"start_time": "2018-05-31T13:13:05.040190Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"X_train, X_test, y_train, y_test = train_test_split(data, target)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Сформируйте два списка \n", | |
"- numeric_columns - с именами численных признаков\n", | |
"- categorical_columns - с именами категориальных признаков" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:05.720404Z", | |
"start_time": "2018-05-31T13:13:05.717325Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"numeric_columns = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch']\n", | |
"categorical_columns = ['Sex', 'Embarked']" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- Выберите из X_train и X_test только численные колонки\n", | |
"- заполните пропуски в данных\n", | |
"- обучите классификатор на X_train и y_train\n", | |
"- проверьте accuracy на X_test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:06.368028Z", | |
"start_time": "2018-05-31T13:13:06.365291Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics import accuracy_score" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:13:06.764125Z", | |
"start_time": "2018-05-31T13:13:06.755224Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.6322869955156951\n" | |
] | |
} | |
], | |
"source": [ | |
"X_train_numeric = X_train[numeric_columns].fillna(0)\n", | |
"X_test_numeric = X_test[numeric_columns].fillna(0)\n", | |
"knn = KNeighborsClassifier(n_neighbors=5)\n", | |
"knn.fit(X_train_numeric, y_train)\n", | |
"pred = knn.predict(X_test_numeric)\n", | |
"print(accuracy_score(pred, y_test))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T11:48:59.291240Z", | |
"start_time": "2018-05-31T11:48:59.285916Z" | |
} | |
}, | |
"source": [ | |
"- Выберите из X_train и X_test категориальные и численные колонки\n", | |
"- преобразуйте категориальные колонки при помощи LabelEncoder\n", | |
"- проверьте accuracy на X_test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:14:39.093807Z", | |
"start_time": "2018-05-31T13:14:39.090793Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.preprocessing import LabelEncoder\n", | |
"from warnings import filterwarnings\n", | |
"filterwarnings('ignore')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:14:48.741481Z", | |
"start_time": "2018-05-31T13:14:48.587277Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.672645739910314\n" | |
] | |
} | |
], | |
"source": [ | |
"X_train_numeric_categorical = X_train[numeric_columns + categorical_columns]\n", | |
"X_test_numeric_categorical = X_test[numeric_columns + categorical_columns]\n", | |
"\n", | |
"for column in categorical_columns:\n", | |
" le = LabelEncoder()\n", | |
" le.fit(data.loc[:, column].fillna(''))\n", | |
" X_train_numeric_categorical[column] = le.transform(X_train_numeric_categorical[column].fillna(''))\n", | |
" X_test_numeric_categorical[column] = le.transform(X_test_numeric_categorical[column].fillna(''))\n", | |
"\n", | |
"X_train_numeric_categorical.fillna(0, inplace=True)\n", | |
"X_test_numeric_categorical.fillna(0, inplace=True)\n", | |
"\n", | |
"knn = KNeighborsClassifier(n_neighbors=5)\n", | |
"knn.fit(X_train_numeric_categorical, y_train)\n", | |
"pred = knn.predict(X_test_numeric_categorical)\n", | |
"print(accuracy_score(pred, y_test))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-30T19:45:42.908083Z", | |
"start_time": "2018-05-30T19:45:42.905275Z" | |
} | |
}, | |
"source": [ | |
"# Data Scaling" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:15:04.964762Z", | |
"start_time": "2018-05-31T13:15:04.961965Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.preprocessing import StandardScaler, MinMaxScaler" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- Преобразуйте train и test при помощи StandardScaler и MinMaxScaler\n", | |
"- Для train используйте метод .fit_transform, а для test - .transform\n", | |
"- Обучите классификатор на полученных данных и проверьте его качество. Изменилось ли оно? В какую сторону и почему?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:15:12.845381Z", | |
"start_time": "2018-05-31T13:15:12.842300Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"ss = StandardScaler()\n", | |
"mms = MinMaxScaler()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:17:00.674723Z", | |
"start_time": "2018-05-31T13:17:00.667066Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.7982062780269058\n" | |
] | |
} | |
], | |
"source": [ | |
"X_train_ss = ss.fit_transform(X_train_numeric_categorical)\n", | |
"X_test_ss = ss.transform(X_test_numeric_categorical)\n", | |
"\n", | |
"knn.fit(X_train_ss, y_train)\n", | |
"pred = knn.predict(X_test_ss)\n", | |
"print(accuracy_score(pred, y_test))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2018-05-31T13:17:01.077408Z", | |
"start_time": "2018-05-31T13:17:01.068475Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0.757847533632287\n" | |
] | |
} | |
], | |
"source": [ | |
"X_train_mms = mms.fit_transform(X_train_numeric_categorical)\n", | |
"X_test_mms = mms.transform(X_test_numeric_categorical)\n", | |
"\n", | |
"knn.fit(X_train_mms, y_train)\n", | |
"pred = knn.predict(X_test_mms)\n", | |
"print(accuracy_score(pred, y_test))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
}, | |
"toc": { | |
"colors": { | |
"hover_highlight": "#DAA520", | |
"running_highlight": "#FF0000", | |
"selected_highlight": "#FFD700" | |
}, | |
"moveMenuLeft": true, | |
"nav_menu": { | |
"height": "84px", | |
"width": "252px" | |
}, | |
"navigate_menu": true, | |
"number_sections": true, | |
"sideBar": true, | |
"threshold": 4, | |
"toc_cell": false, | |
"toc_section_display": "block", | |
"toc_window_display": true | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment