-
-
Save justmarkham/07715a310360e466704508049571c826 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Office Hours session 1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Recap of Lesson 1 (copy from here: http://bit.ly/first-ml-lesson)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"from sklearn.preprocessing import OneHotEncoder\n", | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"from sklearn.linear_model import LogisticRegression\n", | |
"from sklearn.compose import make_column_transformer\n", | |
"from sklearn.pipeline import make_pipeline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cols = ['Parch', 'Fare', 'Embarked', 'Sex', 'Name']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.read_csv('http://bit.ly/kaggletrain', nrows=10)\n", | |
"X = df[cols]\n", | |
"y = df['Survived']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df_new = pd.read_csv('http://bit.ly/kaggletest', nrows=10)\n", | |
"X_new = df_new[cols]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ohe = OneHotEncoder()\n", | |
"vect = CountVectorizer()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ct = make_column_transformer(\n", | |
" (ohe, ['Embarked', 'Sex']),\n", | |
" (vect, 'Name'),\n", | |
" remainder='passthrough')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"logreg = LogisticRegression(solver='liblinear', random_state=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe = make_pipeline(ct, logreg)\n", | |
"pipe.fit(X, y)\n", | |
"pipe.predict(X_new)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Anna: Why is it important to select a Series instead of a DataFrame for the target variable?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Braund, Mr. Owen Harris</td>\n", | |
" <td>male</td>\n", | |
" <td>22.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>A/5 21171</td>\n", | |
" <td>7.2500</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", | |
" <td>female</td>\n", | |
" <td>38.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>PC 17599</td>\n", | |
" <td>71.2833</td>\n", | |
" <td>C85</td>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Heikkinen, Miss. Laina</td>\n", | |
" <td>female</td>\n", | |
" <td>26.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>STON/O2. 3101282</td>\n", | |
" <td>7.9250</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", | |
" <td>female</td>\n", | |
" <td>35.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>113803</td>\n", | |
" <td>53.1000</td>\n", | |
" <td>C123</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Allen, Mr. William Henry</td>\n", | |
" <td>male</td>\n", | |
" <td>35.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>373450</td>\n", | |
" <td>8.0500</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>6</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Moran, Mr. James</td>\n", | |
" <td>male</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>330877</td>\n", | |
" <td>8.4583</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Q</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>7</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>McCarthy, Mr. Timothy J</td>\n", | |
" <td>male</td>\n", | |
" <td>54.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>17463</td>\n", | |
" <td>51.8625</td>\n", | |
" <td>E46</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>8</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Palsson, Master. Gosta Leonard</td>\n", | |
" <td>male</td>\n", | |
" <td>2.0</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>349909</td>\n", | |
" <td>21.0750</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>9</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n", | |
" <td>female</td>\n", | |
" <td>27.0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>347742</td>\n", | |
" <td>11.1333</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>10</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n", | |
" <td>female</td>\n", | |
" <td>14.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>237736</td>\n", | |
" <td>30.0708</td>\n", | |
" <td>NaN</td>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass \\\n", | |
"0 1 0 3 \n", | |
"1 2 1 1 \n", | |
"2 3 1 3 \n", | |
"3 4 1 1 \n", | |
"4 5 0 3 \n", | |
"5 6 0 3 \n", | |
"6 7 0 1 \n", | |
"7 8 0 3 \n", | |
"8 9 1 3 \n", | |
"9 10 1 2 \n", | |
"\n", | |
" Name Sex Age SibSp \\\n", | |
"0 Braund, Mr. Owen Harris male 22.0 1 \n", | |
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", | |
"2 Heikkinen, Miss. Laina female 26.0 0 \n", | |
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", | |
"4 Allen, Mr. William Henry male 35.0 0 \n", | |
"5 Moran, Mr. James male NaN 0 \n", | |
"6 McCarthy, Mr. Timothy J male 54.0 0 \n", | |
"7 Palsson, Master. Gosta Leonard male 2.0 3 \n", | |
"8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 \n", | |
"9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 \n", | |
"\n", | |
" Parch Ticket Fare Cabin Embarked \n", | |
"0 0 A/5 21171 7.2500 NaN S \n", | |
"1 0 PC 17599 71.2833 C85 C \n", | |
"2 0 STON/O2. 3101282 7.9250 NaN S \n", | |
"3 0 113803 53.1000 C123 S \n", | |
"4 0 373450 8.0500 NaN S \n", | |
"5 0 330877 8.4583 NaN Q \n", | |
"6 0 17463 51.8625 E46 S \n", | |
"7 1 349909 21.0750 NaN S \n", | |
"8 2 347742 11.1333 NaN S \n", | |
"9 0 237736 30.0708 NaN C " | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Selecting a Series versus a DataFrame:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 0\n", | |
"1 1\n", | |
"2 1\n", | |
"3 1\n", | |
"4 0\n", | |
"5 0\n", | |
"6 0\n", | |
"7 0\n", | |
"8 1\n", | |
"9 1\n", | |
"Name: Survived, dtype: int64" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['Survived']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Survived</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Survived\n", | |
"0 0\n", | |
"1 1\n", | |
"2 1\n", | |
"3 1\n", | |
"4 0\n", | |
"5 0\n", | |
"6 0\n", | |
"7 0\n", | |
"8 1\n", | |
"9 1" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df[['Survived']]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Series is 1D, DataFrame is 2D:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(10,)" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['Survived'].shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(10, 1)" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df[['Survived']].shape" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Use to_numpy() to convert a pandas object to a NumPy array:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1])" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['Survived'].to_numpy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[0],\n", | |
" [1],\n", | |
" [1],\n", | |
" [1],\n", | |
" [0],\n", | |
" [0],\n", | |
" [0],\n", | |
" [0],\n", | |
" [1],\n", | |
" [1]])" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df[['Survived']].to_numpy()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- A 1D target (such as a Series) is used for classification problems with a single label per sample\n", | |
"- A 2D target (such as a DataFrame) is used for multilabel classification problems, in which each sample can have multiple labels" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Shashi: Can you explain the solver and random_state parameters of logistic regression?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"logreg = LogisticRegression(solver='liblinear', random_state=1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Solver:\n", | |
"\n", | |
"- Calculates the coefficients\n", | |
"- Solvers have different strengths and weaknesses, see comparison here: [Logistic regression section of scikit-learn User Guide](https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression)\n", | |
"- If you get a convergence warning, try changing the solver\n", | |
"\n", | |
"random_state:\n", | |
"\n", | |
"- Use it to ensure reproducibility any time you have a pseudo-random process\n", | |
"- Set it to any integer" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Hause: Is the intercept included in the list of logistic regression coefficients?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"No, the coefficients are stored in the \"coef_\" attribute, and the intercept is stored separately in the \"intercept_\" attribute." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### VK: Why did you use LogisticRegression instead of LogisticRegressionCV, which has built-in cross-validation?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"I prefer not to use super-specialized functions like LogisticRegressionCV, and instead use GridSearchCV which works with any model and integrates well into the scikit-learn workflow." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Arjun: Is there a reason that you didn't do cross-validation after each change that you made to your model?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Normally you would do cross-validation after each change, but in this case the cross-validation scores would have been highly misleading because the dataset is so small (only 10 rows were loaded)." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Anton: Do you always fit the model on the entire dataset after setting the hyperparameters via cross-validation?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Yes, you fit the model to all samples for which you know the target value; otherwise, you are throwing away useful training data." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Rachel: How do I choose the right \"scoring\" parameter?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Figure out what is important to you, and then choose an evaluation metric that matches those priorities.\n", | |
"\n", | |
"Examples:\n", | |
"\n", | |
"- spam filter: optimize for precision\n", | |
"- fraud detector: optimize for recall\n", | |
"\n", | |
"Recommended resources:\n", | |
"\n", | |
"- [Metrics section of scikit-learn User Guide](https://scikit-learn.org/stable/modules/model_evaluation.html)\n", | |
"- [Discrimination Threshold visualizer from Yellowbrick](https://www.scikit-yb.org/en/latest/api/classifier/threshold.html)\n", | |
"- [My video on classifier evaluation](https://github.com/justmarkham/scikit-learn-videos/blob/master/09_classification_metrics.ipynb)\n", | |
"- [My video on linear regression and regression metrics](https://github.com/justmarkham/scikit-learn-videos/blob/master/06_linear_regression.ipynb)\n", | |
"- [My simple guide to confusion matrix terminology](https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/)\n", | |
"- [My video on the confusion matrix (includes advanced topics)](https://www.youtube.com/watch?v=8Oog7TXHvFY)\n", | |
"- [My brief comparison of seven evaluation metrics](https://github.com/justmarkham/DAT8/blob/master/other/model_evaluation_comparison.md)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Arjun: When one-hot encoding, what happens if the testing data has a new category that was not in the training data?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Use fit_transform on training data:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>letter</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>A</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>B</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" letter\n", | |
"0 A\n", | |
"1 B\n", | |
"2 C\n", | |
"3 B" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"demo_train = pd.DataFrame({'letter':['A', 'B', 'C', 'B']})\n", | |
"demo_train" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ohe = OneHotEncoder(sparse=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[1., 0., 0.],\n", | |
" [0., 1., 0.],\n", | |
" [0., 0., 1.],\n", | |
" [0., 1., 0.]])" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ohe.fit_transform(demo_train[['letter']])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"If you use fit_transform on testing data, it won't learn the same categories, which will be problematic:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>letter</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>A</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>A</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" letter\n", | |
"0 A\n", | |
"1 C\n", | |
"2 A" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"demo_test = pd.DataFrame({'letter':['A', 'C', 'A']})\n", | |
"demo_test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[1., 0.],\n", | |
" [0., 1.],\n", | |
" [1., 0.]])" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ohe.fit_transform(demo_test[['letter']])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Always use fit_transform on training data and transform (only) on testing data so that categories will be represented the same way:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[1., 0., 0.],\n", | |
" [0., 1., 0.],\n", | |
" [0., 0., 1.],\n", | |
" [0., 1., 0.]])" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ohe.fit_transform(demo_train[['letter']])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[1., 0., 0.],\n", | |
" [0., 0., 1.],\n", | |
" [1., 0., 0.]])" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ohe.transform(demo_test[['letter']])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"If testing data contains a new category, the encoder will error during transformation:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>letter</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>A</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>D</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" letter\n", | |
"0 A\n", | |
"1 C\n", | |
"2 D" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"demo_test_unknown = pd.DataFrame({'letter':['A', 'C', 'D']})\n", | |
"demo_test_unknown" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# ohe.transform(demo_test_unknown[['letter']])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The solution is to tell the encoder to represent unknown categories as all zeros:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[1., 0., 0.],\n", | |
" [0., 1., 0.],\n", | |
" [0., 0., 1.],\n", | |
" [0., 1., 0.]])" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ohe.fit_transform(demo_train[['letter']])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[1., 0., 0.],\n", | |
" [0., 0., 1.],\n", | |
" [0., 0., 0.]])" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ohe.transform(demo_test_unknown[['letter']])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Advice:\n", | |
"\n", | |
"- By default, keep handle_unknown='error' so you know if you are encountering new categories\n", | |
"- If you encounter new categories, set handle_unknown='ignore' but keep in mind that all unknown categories will be encoded the same way\n", | |
"- As soon as possible, retrain your model with data that includes any new categories" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Chris: Should we drop one of the one-hot encoded features, since some models (such as linear regression) don't like when there is collinearity between features?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Here is the default one-hot encoding:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>letter</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>A</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>B</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>B</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" letter\n", | |
"0 A\n", | |
"1 B\n", | |
"2 C\n", | |
"3 B" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"demo_train" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[1., 0., 0.],\n", | |
" [0., 1., 0.],\n", | |
" [0., 0., 1.],\n", | |
" [0., 1., 0.]])" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ohe.fit_transform(demo_train[['letter']])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"You can also drop the first level (new in version 0.21):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ohe = OneHotEncoder(sparse=False, drop='first')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[0., 0.],\n", | |
" [1., 0.],\n", | |
" [0., 1.],\n", | |
" [1., 0.]])" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ohe.fit_transform(demo_train[['letter']])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- Drop the first level when you know perfectly collinear features will cause problems\n", | |
"- For most models, dropping the first level won't improve performance\n", | |
"- Dropping the first level is incompatible with ignoring unknown categories\n", | |
"- Dropping the first level is likely problematic if you scale your features or use a regularized model" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Paolo: What encoding should I use with an ordinal feature?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"If you have an ordinal feature (categorical feature with a logical ordering) that is already encoded numerically (such as Pclass), then leave it as-is:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Braund, Mr. Owen Harris</td>\n", | |
" <td>male</td>\n", | |
" <td>22.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>A/5 21171</td>\n", | |
" <td>7.2500</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", | |
" <td>female</td>\n", | |
" <td>38.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>PC 17599</td>\n", | |
" <td>71.2833</td>\n", | |
" <td>C85</td>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Heikkinen, Miss. Laina</td>\n", | |
" <td>female</td>\n", | |
" <td>26.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>STON/O2. 3101282</td>\n", | |
" <td>7.9250</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", | |
" <td>female</td>\n", | |
" <td>35.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>113803</td>\n", | |
" <td>53.1000</td>\n", | |
" <td>C123</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Allen, Mr. William Henry</td>\n", | |
" <td>male</td>\n", | |
" <td>35.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>373450</td>\n", | |
" <td>8.0500</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>6</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Moran, Mr. James</td>\n", | |
" <td>male</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>330877</td>\n", | |
" <td>8.4583</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Q</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>7</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>McCarthy, Mr. Timothy J</td>\n", | |
" <td>male</td>\n", | |
" <td>54.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>17463</td>\n", | |
" <td>51.8625</td>\n", | |
" <td>E46</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>8</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Palsson, Master. Gosta Leonard</td>\n", | |
" <td>male</td>\n", | |
" <td>2.0</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>349909</td>\n", | |
" <td>21.0750</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>9</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n", | |
" <td>female</td>\n", | |
" <td>27.0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>347742</td>\n", | |
" <td>11.1333</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>10</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n", | |
" <td>female</td>\n", | |
" <td>14.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>237736</td>\n", | |
" <td>30.0708</td>\n", | |
" <td>NaN</td>\n", | |
" <td>C</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass \\\n", | |
"0 1 0 3 \n", | |
"1 2 1 1 \n", | |
"2 3 1 3 \n", | |
"3 4 1 1 \n", | |
"4 5 0 3 \n", | |
"5 6 0 3 \n", | |
"6 7 0 1 \n", | |
"7 8 0 3 \n", | |
"8 9 1 3 \n", | |
"9 10 1 2 \n", | |
"\n", | |
" Name Sex Age SibSp \\\n", | |
"0 Braund, Mr. Owen Harris male 22.0 1 \n", | |
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", | |
"2 Heikkinen, Miss. Laina female 26.0 0 \n", | |
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", | |
"4 Allen, Mr. William Henry male 35.0 0 \n", | |
"5 Moran, Mr. James male NaN 0 \n", | |
"6 McCarthy, Mr. Timothy J male 54.0 0 \n", | |
"7 Palsson, Master. Gosta Leonard male 2.0 3 \n", | |
"8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 \n", | |
"9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 \n", | |
"\n", | |
" Parch Ticket Fare Cabin Embarked \n", | |
"0 0 A/5 21171 7.2500 NaN S \n", | |
"1 0 PC 17599 71.2833 C85 C \n", | |
"2 0 STON/O2. 3101282 7.9250 NaN S \n", | |
"3 0 113803 53.1000 C123 S \n", | |
"4 0 373450 8.0500 NaN S \n", | |
"5 0 330877 8.4583 NaN Q \n", | |
"6 0 17463 51.8625 E46 S \n", | |
"7 1 349909 21.0750 NaN S \n", | |
"8 2 347742 11.1333 NaN S \n", | |
"9 0 237736 30.0708 NaN C " | |
] | |
}, | |
"execution_count": 33, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"If you have an ordinal feature that is encoded as strings, then use OrdinalEncoder:\n", | |
"\n", | |
"- You define the logical order of the categories\n", | |
"- Each input feature becomes a single output feature (unlike OneHotEncoder)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Class</th>\n", | |
" <th>Size</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>third</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>first</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>second</td>\n", | |
" <td>L</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>third</td>\n", | |
" <td>XL</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Class Size\n", | |
"0 third S\n", | |
"1 first S\n", | |
"2 second L\n", | |
"3 third XL" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_ordinal = pd.DataFrame({'Class': ['third', 'first', 'second', 'third'],\n", | |
" 'Size': ['S', 'S', 'L', 'XL']})\n", | |
"df_ordinal" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[2., 0.],\n", | |
" [0., 0.],\n", | |
" [1., 2.],\n", | |
" [2., 3.]])" | |
] | |
}, | |
"execution_count": 35, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.preprocessing import OrdinalEncoder\n", | |
"ore = OrdinalEncoder(categories=[['first', 'second', 'third'], ['S', 'M', 'L', 'XL']])\n", | |
"ore.fit_transform(df_ordinal)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Vijey: What's the difference between OneHotEncoder, OrdinalEncoder, and LabelEncoder?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- Use OneHotEncoder for unordered categorical features (nominal)\n", | |
"- Use OrdinalEncoder for ordered categorical features (ordinal)\n", | |
"- LabelEncoder is only for labels (meaning targets), and is rarely useful any more since scikit-learn classification models can handle string-based labels" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Shreyas: In a ColumnTransformer, what are the other options for \"remainder\"?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"\"passthrough\" means include all unspecified columns but don't modify them:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ohe = OneHotEncoder()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ct = make_column_transformer(\n", | |
" (ohe, ['Embarked', 'Sex']),\n", | |
" (vect, 'Name'),\n", | |
" remainder='passthrough')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Index(['Parch', 'Fare', 'Embarked', 'Sex', 'Name'], dtype='object')" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"X.columns" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<10x47 sparse matrix of type '<class 'numpy.float64'>'\n", | |
"\twith 78 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 39, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.fit_transform(X)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"\"drop\" means drop all unspecified columns:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ct = make_column_transformer(\n", | |
" (ohe, ['Embarked', 'Sex']),\n", | |
" (vect, 'Name'),\n", | |
" remainder='drop')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<10x45 sparse matrix of type '<class 'numpy.float64'>'\n", | |
"\twith 66 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 41, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.fit_transform(X)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"You can also set remainder to be a transformer object, in which case all unspecified columns will be transformed by that transformer." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Hause: How do I get the column names for the output of ColumnTransformer?" | |
] | |
}, | |
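{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"get_feature_names works in some cases, such as when every transformer implements it and remainder='drop' (in scikit-learn 1.0 and later, use get_feature_names_out instead):" | |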
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['onehotencoder__x0_C',\n", | |
" 'onehotencoder__x0_Q',\n", | |
" 'onehotencoder__x0_S',\n", | |
" 'onehotencoder__x1_female',\n", | |
" 'onehotencoder__x1_male',\n", | |
" 'countvectorizer__achem',\n", | |
" 'countvectorizer__adele',\n", | |
" 'countvectorizer__allen',\n", | |
" 'countvectorizer__berg',\n", | |
" 'countvectorizer__bradley',\n", | |
" 'countvectorizer__braund',\n", | |
" 'countvectorizer__briggs',\n", | |
" 'countvectorizer__cumings',\n", | |
" 'countvectorizer__elisabeth',\n", | |
" 'countvectorizer__florence',\n", | |
" 'countvectorizer__futrelle',\n", | |
" 'countvectorizer__gosta',\n", | |
" 'countvectorizer__harris',\n", | |
" 'countvectorizer__heath',\n", | |
" 'countvectorizer__heikkinen',\n", | |
" 'countvectorizer__henry',\n", | |
" 'countvectorizer__jacques',\n", | |
" 'countvectorizer__james',\n", | |
" 'countvectorizer__john',\n", | |
" 'countvectorizer__johnson',\n", | |
" 'countvectorizer__laina',\n", | |
" 'countvectorizer__leonard',\n", | |
" 'countvectorizer__lily',\n", | |
" 'countvectorizer__master',\n", | |
" 'countvectorizer__may',\n", | |
" 'countvectorizer__mccarthy',\n", | |
" 'countvectorizer__miss',\n", | |
" 'countvectorizer__moran',\n", | |
" 'countvectorizer__mr',\n", | |
" 'countvectorizer__mrs',\n", | |
" 'countvectorizer__nasser',\n", | |
" 'countvectorizer__nicholas',\n", | |
" 'countvectorizer__oscar',\n", | |
" 'countvectorizer__owen',\n", | |
" 'countvectorizer__palsson',\n", | |
" 'countvectorizer__peel',\n", | |
" 'countvectorizer__thayer',\n", | |
" 'countvectorizer__timothy',\n", | |
" 'countvectorizer__vilhelmina',\n", | |
" 'countvectorizer__william']" | |
] | |
}, | |
"execution_count": 42, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.get_feature_names()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"get_feature_names will not (yet) work when remainder='passthrough', at least in the scikit-learn version used here, so the call below is commented out:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ct = make_column_transformer(\n", | |
" (ohe, ['Embarked', 'Sex']),\n", | |
" (vect, 'Name'),\n", | |
" remainder='passthrough')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<10x47 sparse matrix of type '<class 'numpy.float64'>'\n", | |
"\twith 78 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 44, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.fit_transform(X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# ct.get_feature_names()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"In that case, you will have to inspect the transformers one-by-one to figure out the column names:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('onehotencoder',\n", | |
" OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,\n", | |
" handle_unknown='error', sparse=True),\n", | |
" ['Embarked', 'Sex']),\n", | |
" ('countvectorizer',\n", | |
" CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", | |
" dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n", | |
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", | |
" ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", | |
" strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", | |
" tokenizer=None, vocabulary=None),\n", | |
" 'Name'),\n", | |
" ('remainder', 'passthrough', [0, 1])]" | |
] | |
}, | |
"execution_count": 46, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.transformers_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(['x0_C', 'x0_Q', 'x0_S', 'x1_female', 'x1_male'], dtype=object)" | |
] | |
}, | |
"execution_count": 47, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.named_transformers_.onehotencoder.get_feature_names()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['achem',\n", | |
" 'adele',\n", | |
" 'allen',\n", | |
" 'berg',\n", | |
" 'bradley',\n", | |
" 'braund',\n", | |
" 'briggs',\n", | |
" 'cumings',\n", | |
" 'elisabeth',\n", | |
" 'florence',\n", | |
" 'futrelle',\n", | |
" 'gosta',\n", | |
" 'harris',\n", | |
" 'heath',\n", | |
" 'heikkinen',\n", | |
" 'henry',\n", | |
" 'jacques',\n", | |
" 'james',\n", | |
" 'john',\n", | |
" 'johnson',\n", | |
" 'laina',\n", | |
" 'leonard',\n", | |
" 'lily',\n", | |
" 'master',\n", | |
" 'may',\n", | |
" 'mccarthy',\n", | |
" 'miss',\n", | |
" 'moran',\n", | |
" 'mr',\n", | |
" 'mrs',\n", | |
" 'nasser',\n", | |
" 'nicholas',\n", | |
" 'oscar',\n", | |
" 'owen',\n", | |
" 'palsson',\n", | |
" 'peel',\n", | |
" 'thayer',\n", | |
" 'timothy',\n", | |
" 'vilhelmina',\n", | |
" 'william']" | |
] | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.named_transformers_.countvectorizer.get_feature_names()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Index(['Parch', 'Fare', 'Embarked', 'Sex', 'Name'], dtype='object')" | |
] | |
}, | |
"execution_count": 49, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"X.columns" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Arjun: Is there a more efficient way to specify columns for a ColumnTransformer than listing them one-by-one?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"You can specify columns by position or slice:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ct = make_column_transformer(\n", | |
" (ohe, [2, 3]),\n", | |
" (vect, 4),\n", | |
" remainder='passthrough')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ct = make_column_transformer(\n", | |
" (ohe, slice(2, 4)),\n", | |
" (vect, 4),\n", | |
" remainder='passthrough')" | |
] | |
}, | |
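{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"make_column_selector (new in version 0.22) allows you to select columns by regex pattern or data type. For example, the pattern 'E|S' selects every column whose name contains an uppercase E or S (here, Embarked and Sex):" | |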
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.compose import make_column_selector\n", | |
"cs = make_column_selector(pattern='E|S')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ct = make_column_transformer(\n", | |
" (ohe, cs),\n", | |
" (vect, 4),\n", | |
" remainder='passthrough')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Tony: Does pipe.fit() modify the underlying objects (ct, logreg)?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Yes, it does modify the underlying objects:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pipe = make_pipeline(ct, logreg)\n", | |
"pipe.fit(X, y);" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0.18828769, -0.14100295, -0.16593861, 0.66504677, -0.78370063,\n", | |
" 0.11596792, 0.11596792, -0.1262833 , 0.13845919, 0.07231978,\n", | |
" -0.12539973, 0.07231978, 0.07231978, 0.13845919, 0.07231978,\n", | |
" 0.10454614, -0.18913104, -0.12539973, 0.10454614, 0.23375375,\n", | |
" -0.1262833 , 0.10454614, -0.14100295, 0.07231978, 0.13845919,\n", | |
" 0.23375375, -0.18913104, 0.10454614, -0.18913104, 0.10454614,\n", | |
" -0.20188362, 0.23375375, -0.14100295, -0.5945696 , 0.43129302,\n", | |
" 0.11596792, 0.11596792, 0.13845919, -0.12539973, -0.18913104,\n", | |
" 0.10454614, 0.07231978, -0.20188362, 0.13845919, -0.1262833 ,\n", | |
" 0.08778734, 0.01334678]])" | |
] | |
}, | |
"execution_count": 55, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe.named_steps.logisticregression.coef_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 56, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0.18828769, -0.14100295, -0.16593861, 0.66504677, -0.78370063,\n", | |
" 0.11596792, 0.11596792, -0.1262833 , 0.13845919, 0.07231978,\n", | |
" -0.12539973, 0.07231978, 0.07231978, 0.13845919, 0.07231978,\n", | |
" 0.10454614, -0.18913104, -0.12539973, 0.10454614, 0.23375375,\n", | |
" -0.1262833 , 0.10454614, -0.14100295, 0.07231978, 0.13845919,\n", | |
" 0.23375375, -0.18913104, 0.10454614, -0.18913104, 0.10454614,\n", | |
" -0.20188362, 0.23375375, -0.14100295, -0.5945696 , 0.43129302,\n", | |
" 0.11596792, 0.11596792, 0.13845919, -0.12539973, -0.18913104,\n", | |
" 0.10454614, 0.07231978, -0.20188362, 0.13845919, -0.1262833 ,\n", | |
" 0.08778734, 0.01334678]])" | |
] | |
}, | |
"execution_count": 56, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"logreg.coef_" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Arjun: Regarding the code \"pipe.named_steps.logisticregression.coef_\", can you explain what it is doing and why it references \"logisticregression\" rather than \"logreg\"?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This is a two-step pipeline:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Pipeline(memory=None,\n", | |
" steps=[('columntransformer',\n", | |
" ColumnTransformer(n_jobs=None, remainder='passthrough',\n", | |
" sparse_threshold=0.3,\n", | |
" transformer_weights=None,\n", | |
" transformers=[('onehotencoder',\n", | |
" OneHotEncoder(categories='auto',\n", | |
" drop=None,\n", | |
" dtype=<class 'numpy.float64'>,\n", | |
" handle_unknown='error',\n", | |
" sparse=True),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object...\n", | |
" token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", | |
" tokenizer=None,\n", | |
" vocabulary=None),\n", | |
" 4)],\n", | |
" verbose=False)),\n", | |
" ('logisticregression',\n", | |
" LogisticRegression(C=1.0, class_weight=None, dual=False,\n", | |
" fit_intercept=True, intercept_scaling=1,\n", | |
" l1_ratio=None, max_iter=100,\n", | |
" multi_class='auto', n_jobs=None,\n", | |
" penalty='l2', random_state=1,\n", | |
" solver='liblinear', tol=0.0001, verbose=0,\n", | |
" warm_start=False))],\n", | |
" verbose=False)" | |
] | |
}, | |
"execution_count": 57, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The step names were assigned by make_pipeline, and you can examine individual steps via the named_steps attribute:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"dict_keys(['columntransformer', 'logisticregression'])" | |
] | |
}, | |
"execution_count": 58, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe.named_steps.keys()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,\n", | |
" transformer_weights=None,\n", | |
" transformers=[('onehotencoder',\n", | |
" OneHotEncoder(categories='auto', drop=None,\n", | |
" dtype=<class 'numpy.float64'>,\n", | |
" handle_unknown='error',\n", | |
" sparse=True),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fb95ade1b50>),\n", | |
" ('countvectorizer',\n", | |
" CountVectorizer(analyzer='word', binary=False,\n", | |
" decode_error='strict',\n", | |
" dtype=<class 'numpy.int64'>,\n", | |
" encoding='utf-8',\n", | |
" input='content',\n", | |
" lowercase=True, max_df=1.0,\n", | |
" max_features=None, min_df=1,\n", | |
" ngram_range=(1, 1),\n", | |
" preprocessor=None,\n", | |
" stop_words=None,\n", | |
" strip_accents=None,\n", | |
" token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", | |
" tokenizer=None,\n", | |
" vocabulary=None),\n", | |
" 4)],\n", | |
" verbose=False)" | |
] | |
}, | |
"execution_count": 59, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe.named_steps.columntransformer" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 60, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", | |
" intercept_scaling=1, l1_ratio=None, max_iter=100,\n", | |
" multi_class='auto', n_jobs=None, penalty='l2',\n", | |
" random_state=1, solver='liblinear', tol=0.0001, verbose=0,\n", | |
" warm_start=False)" | |
] | |
}, | |
"execution_count": 60, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe.named_steps.logisticregression" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0.18828769, -0.14100295, -0.16593861, 0.66504677, -0.78370063,\n", | |
" 0.11596792, 0.11596792, -0.1262833 , 0.13845919, 0.07231978,\n", | |
" -0.12539973, 0.07231978, 0.07231978, 0.13845919, 0.07231978,\n", | |
" 0.10454614, -0.18913104, -0.12539973, 0.10454614, 0.23375375,\n", | |
" -0.1262833 , 0.10454614, -0.14100295, 0.07231978, 0.13845919,\n", | |
" 0.23375375, -0.18913104, 0.10454614, -0.18913104, 0.10454614,\n", | |
" -0.20188362, 0.23375375, -0.14100295, -0.5945696 , 0.43129302,\n", | |
" 0.11596792, 0.11596792, 0.13845919, -0.12539973, -0.18913104,\n", | |
" 0.10454614, 0.07231978, -0.20188362, 0.13845919, -0.1262833 ,\n", | |
" 0.08778734, 0.01334678]])" | |
] | |
}, | |
"execution_count": 61, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe.named_steps.logisticregression.coef_" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Here are alternative ways to accomplish the same thing:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0.18828769, -0.14100295, -0.16593861, 0.66504677, -0.78370063,\n", | |
" 0.11596792, 0.11596792, -0.1262833 , 0.13845919, 0.07231978,\n", | |
" -0.12539973, 0.07231978, 0.07231978, 0.13845919, 0.07231978,\n", | |
" 0.10454614, -0.18913104, -0.12539973, 0.10454614, 0.23375375,\n", | |
" -0.1262833 , 0.10454614, -0.14100295, 0.07231978, 0.13845919,\n", | |
" 0.23375375, -0.18913104, 0.10454614, -0.18913104, 0.10454614,\n", | |
" -0.20188362, 0.23375375, -0.14100295, -0.5945696 , 0.43129302,\n", | |
" 0.11596792, 0.11596792, 0.13845919, -0.12539973, -0.18913104,\n", | |
" 0.10454614, 0.07231978, -0.20188362, 0.13845919, -0.1262833 ,\n", | |
" 0.08778734, 0.01334678]])" | |
] | |
}, | |
"execution_count": 62, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe.named_steps['logisticregression'].coef_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0.18828769, -0.14100295, -0.16593861, 0.66504677, -0.78370063,\n", | |
" 0.11596792, 0.11596792, -0.1262833 , 0.13845919, 0.07231978,\n", | |
" -0.12539973, 0.07231978, 0.07231978, 0.13845919, 0.07231978,\n", | |
" 0.10454614, -0.18913104, -0.12539973, 0.10454614, 0.23375375,\n", | |
" -0.1262833 , 0.10454614, -0.14100295, 0.07231978, 0.13845919,\n", | |
" 0.23375375, -0.18913104, 0.10454614, -0.18913104, 0.10454614,\n", | |
" -0.20188362, 0.23375375, -0.14100295, -0.5945696 , 0.43129302,\n", | |
" 0.11596792, 0.11596792, 0.13845919, -0.12539973, -0.18913104,\n", | |
" 0.10454614, 0.07231978, -0.20188362, 0.13845919, -0.1262833 ,\n", | |
" 0.08778734, 0.01334678]])" | |
] | |
}, | |
"execution_count": 63, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe['logisticregression'].coef_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 0.18828769, -0.14100295, -0.16593861, 0.66504677, -0.78370063,\n", | |
" 0.11596792, 0.11596792, -0.1262833 , 0.13845919, 0.07231978,\n", | |
" -0.12539973, 0.07231978, 0.07231978, 0.13845919, 0.07231978,\n", | |
" 0.10454614, -0.18913104, -0.12539973, 0.10454614, 0.23375375,\n", | |
" -0.1262833 , 0.10454614, -0.14100295, 0.07231978, 0.13845919,\n", | |
" 0.23375375, -0.18913104, 0.10454614, -0.18913104, 0.10454614,\n", | |
" -0.20188362, 0.23375375, -0.14100295, -0.5945696 , 0.43129302,\n", | |
" 0.11596792, 0.11596792, 0.13845919, -0.12539973, -0.18913104,\n", | |
" 0.10454614, 0.07231978, -0.20188362, 0.13845919, -0.1262833 ,\n", | |
" 0.08778734, 0.01334678]])" | |
] | |
}, | |
"execution_count": 64, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe[1].coef_" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Arjun: What's the difference between make_pipeline and Pipeline?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"make_pipeline:\n", | |
"\n", | |
"- Assigns default step names (lowercase version of the step's class name)\n", | |
"- Easier to read and write than Pipeline code" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"dict_keys(['columntransformer', 'logisticregression'])" | |
] | |
}, | |
"execution_count": 65, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe = make_pipeline(ct, logreg)\n", | |
"pipe.named_steps.keys()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Pipeline:\n", | |
"\n", | |
"- Requires you to assign step names\n", | |
"- Custom step names can be useful for clarity when you are doing a grid search" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"dict_keys(['preprocessor', 'classifier'])" | |
] | |
}, | |
"execution_count": 66, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.pipeline import Pipeline\n", | |
"pipe = Pipeline([('preprocessor', ct), ('classifier', logreg)])\n", | |
"pipe.named_steps.keys()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Hause: Can you walk us through the documentation for Pipeline and ColumnTransformer?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Five pages/page types you need to be familiar with:\n", | |
"\n", | |
"1. API reference: high-level view\n", | |
"2. Class documentation: detailed view of a class\n", | |
"3. User guide: more examples and advice\n", | |
"4. Examples: more complex examples\n", | |
"5. Glossary: glossary of terms" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Cathal: Is there a pandas method \"pdpipe\" that does something similar to scikit-learn's Pipeline?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- DataFrame.pipe is a pandas method for including user-defined functions in a pandas method chain (it is unrelated to scikit-learn's Pipeline)\n", | |
"- pdpipe is a library for writing pandas code using an API that is similar to scikit-learn's Pipeline" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Abla: Can you build feature interactions in a Pipeline?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Yes, using the PolynomialFeatures class, though I don't usually do so:\n", | |
"\n", | |
"- It doesn't scale well if you have lots of features\n", | |
"- I prefer to use tree-based models that can learn feature interactions on their own" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Charles: Why does CountVectorizer expect 1D input instead of 2D input?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Compare OneHotEncoder to CountVectorizer:\n", | |
"\n", | |
"- Most transformers (like OneHotEncoder) expect 2D input\n", | |
"- CountVectorizer expects 1D input" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<10x3 sparse matrix of type '<class 'numpy.float64'>'\n", | |
"\twith 10 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 67, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ohe.fit_transform(X[['Embarked']])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<10x40 sparse matrix of type '<class 'numpy.int64'>'\n", | |
"\twith 46 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 68, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vect.fit_transform(X['Name'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ct = make_column_transformer(\n", | |
" (ohe, ['Embarked']),\n", | |
" (vect, 'Name'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<10x43 sparse matrix of type '<class 'numpy.float64'>'\n", | |
"\twith 56 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 70, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.fit_transform(X)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"One possible reason: CountVectorizer isn't built to accept more than one column as input, thus it doesn't make sense for it to allow 2D input." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### VK: How do I pass multiple columns to CountVectorizer?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"CountVectorizer only accepts one column at a time, so pass each column to make_column_transformer in its own tuple:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ct = make_column_transformer(\n", | |
" (vect, 'Name'),\n", | |
" (vect, 'Sex'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<10x42 sparse matrix of type '<class 'numpy.longlong'>'\n", | |
"\twith 56 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 72, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.fit_transform(X)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"make_column_transformer can't assign both transformers the same name, so it appends a number to the end of each name:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 73, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"dict_keys(['countvectorizer-1', 'countvectorizer-2', 'remainder'])" | |
] | |
}, | |
"execution_count": 73, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ct.named_transformers_.keys()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Motasem: Would the document-term matrix have values greater than 1 if a word is repeated in a row?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Yes:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"text = ['Machine Learning is fun', 'I am learning Machine Learning']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>am</th>\n", | |
" <th>fun</th>\n", | |
" <th>is</th>\n", | |
" <th>learning</th>\n", | |
" <th>machine</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" am fun is learning machine\n", | |
"0 0 1 1 1 1\n", | |
"1 1 0 0 2 1" | |
] | |
}, | |
"execution_count": 75, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd.DataFrame(vect.fit_transform(text).toarray(), columns=vect.get_feature_names())" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Anna: What does \"stored elements\" mean in the sparse matrix?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"\"Stored elements\" is the number of values explicitly stored in the sparse matrix, which in this case is the number of non-zero values:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<2x5 sparse matrix of type '<class 'numpy.int64'>'\n", | |
"\twith 7 stored elements in Compressed Sparse Row format>" | |
] | |
}, | |
"execution_count": 76, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dtm = vect.fit_transform(text)\n", | |
"dtm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" (0, 4)\t1\n", | |
" (0, 3)\t1\n", | |
" (0, 2)\t1\n", | |
" (0, 1)\t1\n", | |
" (1, 4)\t1\n", | |
" (1, 3)\t2\n", | |
" (1, 0)\t1\n" | |
] | |
} | |
], | |
"source": [ | |
"print(dtm)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Khaled: What happens if there are words in the testing set that didn't appear in the training set?" | |
] | |
}, | |
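{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Words in the testing set that didn't appear in the training set will be ignored, because the vocabulary is learned only when the vectorizer is fit:" | |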
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 78, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['am', 'fun', 'is', 'learning', 'machine']" | |
] | |
}, | |
"execution_count": 78, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vect.get_feature_names()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 79, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[0, 1, 1, 0, 0]])" | |
] | |
}, | |
"execution_count": 79, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vect.transform(['Data Science is FUN!']).toarray()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Anton: Once I've built a model (or pipeline) that I'm happy with, how can I save it so that I can use it later to make predictions?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Reset our pipeline:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 80, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ohe = OneHotEncoder()\n", | |
"vect = CountVectorizer()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 81, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ct = make_column_transformer(\n", | |
" (ohe, ['Embarked', 'Sex']),\n", | |
" (vect, 'Name'),\n", | |
" remainder='passthrough')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 82, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"logreg = LogisticRegression(solver='liblinear', random_state=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 83, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pipe = make_pipeline(ct, logreg)\n", | |
"pipe.fit(X, y);" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"You can save it to a file using pickle:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 84, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pickle" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 85, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open('pipe.pickle', 'wb') as f:\n", | |
" pickle.dump(pipe, f)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 86, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open('pipe.pickle', 'rb') as f:\n", | |
" pipe_from_pickle = pickle.load(f)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 87, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])" | |
] | |
}, | |
"execution_count": 87, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe_from_pickle.predict(X_new)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Alternatively, you can save it to a file using joblib (which is more efficient than pickle for objects that contain large NumPy arrays, as fitted scikit-learn objects often do):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 88, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import joblib" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 89, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['pipe.joblib']" | |
] | |
}, | |
"execution_count": 89, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"joblib.dump(pipe, 'pipe.joblib')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 90, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pipe_from_joblib = joblib.load('pipe.joblib')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 91, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0])" | |
] | |
}, | |
"execution_count": 91, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pipe_from_joblib.predict(X_new)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"For objects saved with either pickle or joblib:\n", | |
"\n", | |
"- You should only load them into an environment identical to the one in which they were saved (the same versions of scikit-learn and its dependencies)\n", | |
"- You should only load objects from sources you trust, since loading a pickle or joblib file can execute arbitrary code\n", | |
"\n", | |
"Alternatives are available that don't save the full model object, but do save a representation that can be used to make predictions:\n", | |
"\n", | |
"- See the \"Model export for production\" section on this page: [scikit-learn Related Projects](https://scikit-learn.org/stable/related_projects.html)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Darnell: Will you be giving us homework or exercises in the course?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"There won't be any in the Live Course, but there may be some exercises or walkthroughs in the Advanced Course." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Khaled: You mentioned that there will be an \"Advanced Course\" after this one. How can I register for it?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- If you purchased the \"Live Course + Advanced Course\" bundle, you will get automatic access to the Advanced Course when it's released\n", | |
"- If you did not purchase the bundle, please email me if you would like to purchase the upgrade" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment