Skip to content

Instantly share code, notes, and snippets.

@monisoi
Created October 15, 2017 10:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save monisoi/e1d5483b2e58f96fee872ee3ea19226e to your computer and use it in GitHub Desktop.
Save monisoi/e1d5483b2e58f96fee872ee3ea19226e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Braund, Mr. Owen Harris</td>\n",
" <td>male</td>\n",
" <td>22.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>A/5 21171</td>\n",
" <td>7.2500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td>female</td>\n",
" <td>38.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>PC 17599</td>\n",
" <td>71.2833</td>\n",
" <td>C85</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>Heikkinen, Miss. Laina</td>\n",
" <td>female</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>STON/O2. 3101282</td>\n",
" <td>7.9250</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>113803</td>\n",
" <td>53.1000</td>\n",
" <td>C123</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Allen, Mr. William Henry</td>\n",
" <td>male</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>373450</td>\n",
" <td>8.0500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Moran, Mr. James</td>\n",
" <td>male</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>330877</td>\n",
" <td>8.4583</td>\n",
" <td>NaN</td>\n",
" <td>Q</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>McCarthy, Mr. Timothy J</td>\n",
" <td>male</td>\n",
" <td>54.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>17463</td>\n",
" <td>51.8625</td>\n",
" <td>E46</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Palsson, Master. Gosta Leonard</td>\n",
" <td>male</td>\n",
" <td>2.0</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>349909</td>\n",
" <td>21.0750</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n",
" <td>female</td>\n",
" <td>27.0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>347742</td>\n",
" <td>11.1333</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n",
" <td>female</td>\n",
" <td>14.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>237736</td>\n",
" <td>30.0708</td>\n",
" <td>NaN</td>\n",
" <td>C</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"5 6 0 3 \n",
"6 7 0 1 \n",
"7 8 0 3 \n",
"8 9 1 3 \n",
"9 10 1 2 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"5 Moran, Mr. James male NaN 0 \n",
"6 McCarthy, Mr. Timothy J male 54.0 0 \n",
"7 Palsson, Master. Gosta Leonard male 2.0 3 \n",
"8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 \n",
"9 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S \n",
"5 0 330877 8.4583 NaN Q \n",
"6 0 17463 51.8625 E46 S \n",
"7 1 349909 21.0750 NaN S \n",
"8 2 347742 11.1333 NaN S \n",
"9 0 237736 30.0708 NaN C "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the training CSV into a DataFrame and preview its first 10 rows\n",
"train_df = pd.read_csv(filepath_or_buffer=\"train.csv\")\n",
"train_df.head(n=10)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Encode Sex as a single 0/1 integer column (female -> 0, male -> 1).\n",
"# This is a binary label mapping rather than a one-hot encoding.\n",
"# .assign() returns a new DataFrame, so train_df keeps its original string\n",
"# values and this cell can be re-run without the mapping producing NaN.\n",
"sex_codes = {'female': 0, 'male': 1}\n",
"train_df_proc = train_df.assign(Sex=train_df['Sex'].map(sex_codes).astype(int))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" C Q S\n",
"0 0 0 1\n",
"1 1 0 0\n",
"2 0 0 1\n",
"3 0 0 1\n",
"4 0 0 1\n"
]
}
],
"source": [
"# One-hot encode Embarked into C / Q indicator columns; S is dropped as the baseline.\n",
"# NOTE: any row whose Embarked is missing gets 0 in every indicator, so once S is\n",
"# dropped it looks the same as an S row.\n",
"# astype(int) keeps the indicators as 0/1 integers even on pandas versions where\n",
"# get_dummies returns booleans.\n",
"embarked_dummy = pd.get_dummies(train_df['Embarked']).astype(int)\n",
"\n",
"# Remove Embarked and any indicator columns left over from a previous run of this\n",
"# cell before attaching the new ones, so re-running does not raise a KeyError or\n",
"# create duplicate C / Q columns.\n",
"stale_columns = ['Embarked'] + list(embarked_dummy.columns)\n",
"train_df_proc = pd.concat(\n",
"    (\n",
"        train_df_proc.drop(stale_columns, axis=1, errors='ignore'),\n",
"        embarked_dummy.drop('S', axis=1),\n",
"    ),\n",
"    axis=1,\n",
")\n",
"\n",
"# Preview the full set of indicator columns (including S) as the cell output.\n",
"embarked_dummy.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Fare</th>\n",
" <th>C</th>\n",
" <th>Q</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>PassengerId</th>\n",
" <td>1.000000</td>\n",
" <td>0.145894</td>\n",
" <td>-0.087759</td>\n",
" <td>-0.024313</td>\n",
" <td>0.042758</td>\n",
" <td>-0.081876</td>\n",
" <td>-0.050419</td>\n",
" <td>0.029345</td>\n",
" <td>-0.033257</td>\n",
" <td>-0.052956</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Survived</th>\n",
" <td>0.145894</td>\n",
" <td>1.000000</td>\n",
" <td>-0.037227</td>\n",
" <td>-0.535727</td>\n",
" <td>-0.244604</td>\n",
" <td>0.100339</td>\n",
" <td>0.018723</td>\n",
" <td>0.134019</td>\n",
" <td>0.098712</td>\n",
" <td>-0.039232</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Pclass</th>\n",
" <td>-0.087759</td>\n",
" <td>-0.037227</td>\n",
" <td>1.000000</td>\n",
" <td>-0.041725</td>\n",
" <td>-0.307590</td>\n",
" <td>-0.100324</td>\n",
" <td>0.049894</td>\n",
" <td>-0.315069</td>\n",
" <td>-0.228001</td>\n",
" <td>-0.038676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Sex</th>\n",
" <td>-0.024313</td>\n",
" <td>-0.535727</td>\n",
" <td>-0.041725</td>\n",
" <td>1.000000</td>\n",
" <td>0.172307</td>\n",
" <td>-0.095344</td>\n",
" <td>-0.081832</td>\n",
" <td>-0.129871</td>\n",
" <td>-0.053879</td>\n",
" <td>-0.002826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Age</th>\n",
" <td>0.042758</td>\n",
" <td>-0.244604</td>\n",
" <td>-0.307590</td>\n",
" <td>0.172307</td>\n",
" <td>1.000000</td>\n",
" <td>-0.161625</td>\n",
" <td>-0.274813</td>\n",
" <td>-0.091542</td>\n",
" <td>0.076824</td>\n",
" <td>0.017855</td>\n",
" </tr>\n",
" <tr>\n",
" <th>SibSp</th>\n",
" <td>-0.081876</td>\n",
" <td>0.100339</td>\n",
" <td>-0.100324</td>\n",
" <td>-0.095344</td>\n",
" <td>-0.161625</td>\n",
" <td>1.000000</td>\n",
" <td>0.258993</td>\n",
" <td>0.285492</td>\n",
" <td>-0.050628</td>\n",
" <td>0.169778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Parch</th>\n",
" <td>-0.050419</td>\n",
" <td>0.018723</td>\n",
" <td>0.049894</td>\n",
" <td>-0.081832</td>\n",
" <td>-0.274813</td>\n",
" <td>0.258993</td>\n",
" <td>1.000000</td>\n",
" <td>0.388783</td>\n",
" <td>-0.068949</td>\n",
" <td>-0.065543</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Fare</th>\n",
" <td>0.029345</td>\n",
" <td>0.134019</td>\n",
" <td>-0.315069</td>\n",
" <td>-0.129871</td>\n",
" <td>-0.091542</td>\n",
" <td>0.285492</td>\n",
" <td>0.388783</td>\n",
" <td>1.000000</td>\n",
" <td>0.239531</td>\n",
" <td>0.015604</td>\n",
" </tr>\n",
" <tr>\n",
" <th>C</th>\n",
" <td>-0.033257</td>\n",
" <td>0.098712</td>\n",
" <td>-0.228001</td>\n",
" <td>-0.053879</td>\n",
" <td>0.076824</td>\n",
" <td>-0.050628</td>\n",
" <td>-0.068949</td>\n",
" <td>0.239531</td>\n",
" <td>1.000000</td>\n",
" <td>-0.076941</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Q</th>\n",
" <td>-0.052956</td>\n",
" <td>-0.039232</td>\n",
" <td>-0.038676</td>\n",
" <td>-0.002826</td>\n",
" <td>0.017855</td>\n",
" <td>0.169778</td>\n",
" <td>-0.065543</td>\n",
" <td>0.015604</td>\n",
" <td>-0.076941</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PassengerId Survived Pclass Sex Age SibSp \\\n",
"PassengerId 1.000000 0.145894 -0.087759 -0.024313 0.042758 -0.081876 \n",
"Survived 0.145894 1.000000 -0.037227 -0.535727 -0.244604 0.100339 \n",
"Pclass -0.087759 -0.037227 1.000000 -0.041725 -0.307590 -0.100324 \n",
"Sex -0.024313 -0.535727 -0.041725 1.000000 0.172307 -0.095344 \n",
"Age 0.042758 -0.244604 -0.307590 0.172307 1.000000 -0.161625 \n",
"SibSp -0.081876 0.100339 -0.100324 -0.095344 -0.161625 1.000000 \n",
"Parch -0.050419 0.018723 0.049894 -0.081832 -0.274813 0.258993 \n",
"Fare 0.029345 0.134019 -0.315069 -0.129871 -0.091542 0.285492 \n",
"C -0.033257 0.098712 -0.228001 -0.053879 0.076824 -0.050628 \n",
"Q -0.052956 -0.039232 -0.038676 -0.002826 0.017855 0.169778 \n",
"\n",
" Parch Fare C Q \n",
"PassengerId -0.050419 0.029345 -0.033257 -0.052956 \n",
"Survived 0.018723 0.134019 0.098712 -0.039232 \n",
"Pclass 0.049894 -0.315069 -0.228001 -0.038676 \n",
"Sex -0.081832 -0.129871 -0.053879 -0.002826 \n",
"Age -0.274813 -0.091542 0.076824 0.017855 \n",
"SibSp 0.258993 0.285492 -0.050628 0.169778 \n",
"Parch 1.000000 0.388783 -0.068949 -0.065543 \n",
"Fare 0.388783 1.000000 0.239531 0.015604 \n",
"C -0.068949 0.239531 1.000000 -0.076941 \n",
"Q -0.065543 0.015604 -0.076941 1.000000 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Correlation matrix of the numeric / indicator columns.\n",
"# Select those columns *before* dropping NaNs: text columns such as Cabin cannot\n",
"# take part in a correlation, yet with dropna(how='any') on the full frame their\n",
"# missing values would discard rows that are otherwise complete.\n",
"train_df_proc_dn = (\n",
"    train_df_proc\n",
"    .select_dtypes(include=['number', 'bool'])\n",
"    .dropna(axis=0, how='any')\n",
")\n",
"train_df_proc_dn.corr()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment