Skip to content

Instantly share code, notes, and snippets.

@hotchpotch
Created February 24, 2021 07:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hotchpotch/e09a956354d451dc63fb876612e67b74 to your computer and use it in GitHub Desktop.
Save hotchpotch/e09a956354d451dc63fb876612e67b74 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.5 64-bit (conda)",
"metadata": {
"interpreter": {
"hash": "8488d9a3dd5160f2f4d77e183b691611538e5a603e22a979b5fd2f623abd6fb1"
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import mglearn\n",
"import japanize_matplotlib\n",
"from IPython.display import display\n",
"from pprint import pp, pformat as pf\n",
"dp = display"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# データのロード\n",
"def load_titanic():\n",
" train = pd.read_csv('./input/titanic/train.csv')\n",
" test = pd.read_csv('./input/titanic/test.csv')\n",
" gender_submission = pd.read_csv('./input/titanic/gender_submission.csv')\n",
" return (train, test, gender_submission)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 891 entries, 0 to 890\nData columns (total 12 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 PassengerId 891 non-null int64 \n 1 Survived 891 non-null int64 \n 2 Pclass 891 non-null int64 \n 3 Name 891 non-null object \n 4 Sex 891 non-null object \n 5 Age 714 non-null float64\n 6 SibSp 891 non-null int64 \n 7 Parch 891 non-null int64 \n 8 Ticket 891 non-null object \n 9 Fare 891 non-null float64\n 10 Cabin 204 non-null object \n 11 Embarked 889 non-null object \ndtypes: float64(2), int64(5), object(5)\nmemory usage: 83.7+ KB\n"
]
}
],
"source": [
"train, test, gender_submission = load_titanic()\n",
"train.info()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 418 entries, 0 to 417\nData columns (total 11 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 PassengerId 418 non-null int64 \n 1 Pclass 418 non-null int64 \n 2 Name 418 non-null object \n 3 Sex 418 non-null object \n 4 Age 332 non-null float64\n 5 SibSp 418 non-null int64 \n 6 Parch 418 non-null int64 \n 7 Ticket 418 non-null object \n 8 Fare 417 non-null float64\n 9 Cabin 91 non-null object \n 10 Embarked 418 non-null object \ndtypes: float64(2), int64(4), object(5)\nmemory usage: 36.0+ KB\n"
]
}
],
"source": [
"test.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"5 6 0 3 \n",
"6 7 0 1 \n",
"7 8 0 3 \n",
"8 9 1 3 \n",
"9 10 1 2 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"5 Moran, Mr. James male NaN 0 \n",
"6 McCarthy, Mr. Timothy J male 54.0 0 \n",
"7 Palsson, Master. Gosta Leonard male 2.0 3 \n",
"8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 \n",
"9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S \n",
"5 0 330877 8.4583 NaN Q \n",
"6 0 17463 51.8625 E46 S \n",
"7 1 349909 21.0750 NaN S \n",
"8 2 347742 11.1333 NaN S \n",
"9 0 237736 30.0708 NaN C "
],
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PassengerId</th>\n <th>Survived</th>\n <th>Pclass</th>\n <th>Name</th>\n <th>Sex</th>\n <th>Age</th>\n <th>SibSp</th>\n <th>Parch</th>\n <th>Ticket</th>\n <th>Fare</th>\n <th>Cabin</th>\n <th>Embarked</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>Braund, Mr. Owen Harris</td>\n <td>male</td>\n <td>22.0</td>\n <td>1</td>\n <td>0</td>\n <td>A/5 21171</td>\n <td>7.2500</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n <td>female</td>\n <td>38.0</td>\n <td>1</td>\n <td>0</td>\n <td>PC 17599</td>\n <td>71.2833</td>\n <td>C85</td>\n <td>C</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>1</td>\n <td>3</td>\n <td>Heikkinen, Miss. Laina</td>\n <td>female</td>\n <td>26.0</td>\n <td>0</td>\n <td>0</td>\n <td>STON/O2. 3101282</td>\n <td>7.9250</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>1</td>\n <td>1</td>\n <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n <td>female</td>\n <td>35.0</td>\n <td>1</td>\n <td>0</td>\n <td>113803</td>\n <td>53.1000</td>\n <td>C123</td>\n <td>S</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>0</td>\n <td>3</td>\n <td>Allen, Mr. William Henry</td>\n <td>male</td>\n <td>35.0</td>\n <td>0</td>\n <td>0</td>\n <td>373450</td>\n <td>8.0500</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>0</td>\n <td>3</td>\n <td>Moran, Mr. James</td>\n <td>male</td>\n <td>NaN</td>\n <td>0</td>\n <td>0</td>\n <td>330877</td>\n <td>8.4583</td>\n <td>NaN</td>\n <td>Q</td>\n </tr>\n <tr>\n <th>6</th>\n <td>7</td>\n <td>0</td>\n <td>1</td>\n <td>McCarthy, Mr. Timothy J</td>\n <td>male</td>\n <td>54.0</td>\n <td>0</td>\n <td>0</td>\n <td>17463</td>\n <td>51.8625</td>\n <td>E46</td>\n <td>S</td>\n </tr>\n <tr>\n <th>7</th>\n <td>8</td>\n <td>0</td>\n <td>3</td>\n <td>Palsson, Master. Gosta Leonard</td>\n <td>male</td>\n <td>2.0</td>\n <td>3</td>\n <td>1</td>\n <td>349909</td>\n <td>21.0750</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>8</th>\n <td>9</td>\n <td>1</td>\n <td>3</td>\n <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n <td>female</td>\n <td>27.0</td>\n <td>0</td>\n <td>2</td>\n <td>347742</td>\n <td>11.1333</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>9</th>\n <td>10</td>\n <td>1</td>\n <td>2</td>\n <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n <td>female</td>\n <td>14.0</td>\n <td>1</td>\n <td>0</td>\n <td>237736</td>\n <td>30.0708</td>\n <td>NaN</td>\n <td>C</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 5
}
],
"source": [
"train.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"* train\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "PassengerId 0\nSurvived 0\nPclass 0\nName 0\nSex 0\nAge 177\nSibSp 0\nParch 0\nTicket 0\nFare 0\nCabin 687\nEmbarked 2\ndtype: int64"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"* test\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "PassengerId 0\nPclass 0\nName 0\nSex 0\nAge 86\nSibSp 0\nParch 0\nTicket 0\nFare 1\nCabin 327\nEmbarked 0\ndtype: int64"
},
"metadata": {}
}
],
"source": [
"# 欠損値チェック\n",
"print(\"* train\")\n",
"dp(train.isnull().sum())\n",
"print(\"* test\")\n",
"dp(test.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Survived Pclass Age SibSp \\\n",
"count 891.000000 891.000000 891.000000 714.000000 891.000000 \n",
"mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n",
"std 257.353842 0.486592 0.836071 14.526497 1.102743 \n",
"min 1.000000 0.000000 1.000000 0.420000 0.000000 \n",
"25% 223.500000 0.000000 2.000000 20.125000 0.000000 \n",
"50% 446.000000 0.000000 3.000000 28.000000 0.000000 \n",
"75% 668.500000 1.000000 3.000000 38.000000 1.000000 \n",
"max 891.000000 1.000000 3.000000 80.000000 8.000000 \n",
"\n",
" Parch Fare \n",
"count 891.000000 891.000000 \n",
"mean 0.381594 32.204208 \n",
"std 0.806057 49.693429 \n",
"min 0.000000 0.000000 \n",
"25% 0.000000 7.910400 \n",
"50% 0.000000 14.454200 \n",
"75% 0.000000 31.000000 \n",
"max 6.000000 512.329200 "
],
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PassengerId</th>\n <th>Survived</th>\n <th>Pclass</th>\n <th>Age</th>\n <th>SibSp</th>\n <th>Parch</th>\n <th>Fare</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>891.000000</td>\n <td>891.000000</td>\n <td>891.000000</td>\n <td>714.000000</td>\n <td>891.000000</td>\n <td>891.000000</td>\n <td>891.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>446.000000</td>\n <td>0.383838</td>\n <td>2.308642</td>\n <td>29.699118</td>\n <td>0.523008</td>\n <td>0.381594</td>\n <td>32.204208</td>\n </tr>\n <tr>\n <th>std</th>\n <td>257.353842</td>\n <td>0.486592</td>\n <td>0.836071</td>\n <td>14.526497</td>\n <td>1.102743</td>\n <td>0.806057</td>\n <td>49.693429</td>\n </tr>\n <tr>\n <th>min</th>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>1.000000</td>\n <td>0.420000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>223.500000</td>\n <td>0.000000</td>\n <td>2.000000</td>\n <td>20.125000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>7.910400</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>446.000000</td>\n <td>0.000000</td>\n <td>3.000000</td>\n <td>28.000000</td>\n <td>0.000000</td>\n <td>0.000000</td>\n <td>14.454200</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>668.500000</td>\n <td>1.000000</td>\n <td>3.000000</td>\n <td>38.000000</td>\n <td>1.000000</td>\n <td>0.000000</td>\n <td>31.000000</td>\n </tr>\n <tr>\n <th>max</th>\n <td>891.000000</td>\n <td>1.000000</td>\n <td>3.000000</td>\n <td>80.000000</td>\n <td>8.000000</td>\n <td>6.000000</td>\n <td>512.329200</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 7
}
],
"source": [
"train.describe()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Summarize dataset: 100%|██████████| 25/25 [00:06<00:00, 4.06it/s, Completed]\n",
"Generate report structure: 100%|██████████| 1/1 [00:03<00:00, 3.45s/it]\n",
"Render HTML: 100%|██████████| 1/1 [00:01<00:00, 1.10s/it]\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment