Skip to content

Instantly share code, notes, and snippets.

@kiwamizamurai
Created November 27, 2018 08:55
Show Gist options
  • Save kiwamizamurai/1a884182e037bb5335f57e8a8d3d033c to your computer and use it in GitHub Desktop.
Save kiwamizamurai/1a884182e037bb5335f57e8a8d3d033c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# REFERENCE\n",
"- http://pbpython.com/categorical-encoding.html\n",
"\n",
"ここでは上のリンクを参考にデータの読み方、前処理について勉強したいと思う。"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"pd.set_option('display.max_columns', 50)\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"sns.set_style(\"whitegrid\")\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>symboling</th>\n",
" <th>normalized_losses</th>\n",
" <th>make</th>\n",
" <th>fuel_type</th>\n",
" <th>aspiration</th>\n",
" <th>num_doors</th>\n",
" <th>body_style</th>\n",
" <th>drive_wheels</th>\n",
" <th>engine_location</th>\n",
" <th>wheel_base</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>height</th>\n",
" <th>curb_weight</th>\n",
" <th>engine_type</th>\n",
" <th>num_cylinders</th>\n",
" <th>engine_size</th>\n",
" <th>fuel_system</th>\n",
" <th>bore</th>\n",
" <th>stroke</th>\n",
" <th>compression_ratio</th>\n",
" <th>horsepower</th>\n",
" <th>peak_rpm</th>\n",
" <th>city_mpg</th>\n",
" <th>highway_mpg</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>alfa-romero</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>two</td>\n",
" <td>convertible</td>\n",
" <td>rwd</td>\n",
" <td>front</td>\n",
" <td>88.6</td>\n",
" <td>168.8</td>\n",
" <td>64.1</td>\n",
" <td>48.8</td>\n",
" <td>2548</td>\n",
" <td>dohc</td>\n",
" <td>four</td>\n",
" <td>130</td>\n",
" <td>mpfi</td>\n",
" <td>3.47</td>\n",
" <td>2.68</td>\n",
" <td>9.0</td>\n",
" <td>111.0</td>\n",
" <td>5000.0</td>\n",
" <td>21</td>\n",
" <td>27</td>\n",
" <td>13495.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>alfa-romero</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>two</td>\n",
" <td>convertible</td>\n",
" <td>rwd</td>\n",
" <td>front</td>\n",
" <td>88.6</td>\n",
" <td>168.8</td>\n",
" <td>64.1</td>\n",
" <td>48.8</td>\n",
" <td>2548</td>\n",
" <td>dohc</td>\n",
" <td>four</td>\n",
" <td>130</td>\n",
" <td>mpfi</td>\n",
" <td>3.47</td>\n",
" <td>2.68</td>\n",
" <td>9.0</td>\n",
" <td>111.0</td>\n",
" <td>5000.0</td>\n",
" <td>21</td>\n",
" <td>27</td>\n",
" <td>16500.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>alfa-romero</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>two</td>\n",
" <td>hatchback</td>\n",
" <td>rwd</td>\n",
" <td>front</td>\n",
" <td>94.5</td>\n",
" <td>171.2</td>\n",
" <td>65.5</td>\n",
" <td>52.4</td>\n",
" <td>2823</td>\n",
" <td>ohcv</td>\n",
" <td>six</td>\n",
" <td>152</td>\n",
" <td>mpfi</td>\n",
" <td>2.68</td>\n",
" <td>3.47</td>\n",
" <td>9.0</td>\n",
" <td>154.0</td>\n",
" <td>5000.0</td>\n",
" <td>19</td>\n",
" <td>26</td>\n",
" <td>16500.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>164.0</td>\n",
" <td>audi</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>four</td>\n",
" <td>sedan</td>\n",
" <td>fwd</td>\n",
" <td>front</td>\n",
" <td>99.8</td>\n",
" <td>176.6</td>\n",
" <td>66.2</td>\n",
" <td>54.3</td>\n",
" <td>2337</td>\n",
" <td>ohc</td>\n",
" <td>four</td>\n",
" <td>109</td>\n",
" <td>mpfi</td>\n",
" <td>3.19</td>\n",
" <td>3.40</td>\n",
" <td>10.0</td>\n",
" <td>102.0</td>\n",
" <td>5500.0</td>\n",
" <td>24</td>\n",
" <td>30</td>\n",
" <td>13950.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>164.0</td>\n",
" <td>audi</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>four</td>\n",
" <td>sedan</td>\n",
" <td>4wd</td>\n",
" <td>front</td>\n",
" <td>99.4</td>\n",
" <td>176.6</td>\n",
" <td>66.4</td>\n",
" <td>54.3</td>\n",
" <td>2824</td>\n",
" <td>ohc</td>\n",
" <td>five</td>\n",
" <td>136</td>\n",
" <td>mpfi</td>\n",
" <td>3.19</td>\n",
" <td>3.40</td>\n",
" <td>8.0</td>\n",
" <td>115.0</td>\n",
" <td>5500.0</td>\n",
" <td>18</td>\n",
" <td>22</td>\n",
" <td>17450.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>audi</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>two</td>\n",
" <td>sedan</td>\n",
" <td>fwd</td>\n",
" <td>front</td>\n",
" <td>99.8</td>\n",
" <td>177.3</td>\n",
" <td>66.3</td>\n",
" <td>53.1</td>\n",
" <td>2507</td>\n",
" <td>ohc</td>\n",
" <td>five</td>\n",
" <td>136</td>\n",
" <td>mpfi</td>\n",
" <td>3.19</td>\n",
" <td>3.40</td>\n",
" <td>8.5</td>\n",
" <td>110.0</td>\n",
" <td>5500.0</td>\n",
" <td>19</td>\n",
" <td>25</td>\n",
" <td>15250.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>158.0</td>\n",
" <td>audi</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>four</td>\n",
" <td>sedan</td>\n",
" <td>fwd</td>\n",
" <td>front</td>\n",
" <td>105.8</td>\n",
" <td>192.7</td>\n",
" <td>71.4</td>\n",
" <td>55.7</td>\n",
" <td>2844</td>\n",
" <td>ohc</td>\n",
" <td>five</td>\n",
" <td>136</td>\n",
" <td>mpfi</td>\n",
" <td>3.19</td>\n",
" <td>3.40</td>\n",
" <td>8.5</td>\n",
" <td>110.0</td>\n",
" <td>5500.0</td>\n",
" <td>19</td>\n",
" <td>25</td>\n",
" <td>17710.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>audi</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>four</td>\n",
" <td>wagon</td>\n",
" <td>fwd</td>\n",
" <td>front</td>\n",
" <td>105.8</td>\n",
" <td>192.7</td>\n",
" <td>71.4</td>\n",
" <td>55.7</td>\n",
" <td>2954</td>\n",
" <td>ohc</td>\n",
" <td>five</td>\n",
" <td>136</td>\n",
" <td>mpfi</td>\n",
" <td>3.19</td>\n",
" <td>3.40</td>\n",
" <td>8.5</td>\n",
" <td>110.0</td>\n",
" <td>5500.0</td>\n",
" <td>19</td>\n",
" <td>25</td>\n",
" <td>18920.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>158.0</td>\n",
" <td>audi</td>\n",
" <td>gas</td>\n",
" <td>turbo</td>\n",
" <td>four</td>\n",
" <td>sedan</td>\n",
" <td>fwd</td>\n",
" <td>front</td>\n",
" <td>105.8</td>\n",
" <td>192.7</td>\n",
" <td>71.4</td>\n",
" <td>55.9</td>\n",
" <td>3086</td>\n",
" <td>ohc</td>\n",
" <td>five</td>\n",
" <td>131</td>\n",
" <td>mpfi</td>\n",
" <td>3.13</td>\n",
" <td>3.40</td>\n",
" <td>8.3</td>\n",
" <td>140.0</td>\n",
" <td>5500.0</td>\n",
" <td>17</td>\n",
" <td>20</td>\n",
" <td>23875.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>audi</td>\n",
" <td>gas</td>\n",
" <td>turbo</td>\n",
" <td>two</td>\n",
" <td>hatchback</td>\n",
" <td>4wd</td>\n",
" <td>front</td>\n",
" <td>99.5</td>\n",
" <td>178.2</td>\n",
" <td>67.9</td>\n",
" <td>52.0</td>\n",
" <td>3053</td>\n",
" <td>ohc</td>\n",
" <td>five</td>\n",
" <td>131</td>\n",
" <td>mpfi</td>\n",
" <td>3.13</td>\n",
" <td>3.40</td>\n",
" <td>7.0</td>\n",
" <td>160.0</td>\n",
" <td>5500.0</td>\n",
" <td>16</td>\n",
" <td>22</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" symboling normalized_losses make fuel_type aspiration num_doors \\\n",
"0 3 NaN alfa-romero gas std two \n",
"1 3 NaN alfa-romero gas std two \n",
"2 1 NaN alfa-romero gas std two \n",
"3 2 164.0 audi gas std four \n",
"4 2 164.0 audi gas std four \n",
"5 2 NaN audi gas std two \n",
"6 1 158.0 audi gas std four \n",
"7 1 NaN audi gas std four \n",
"8 1 158.0 audi gas turbo four \n",
"9 0 NaN audi gas turbo two \n",
"\n",
" body_style drive_wheels engine_location wheel_base length width \\\n",
"0 convertible rwd front 88.6 168.8 64.1 \n",
"1 convertible rwd front 88.6 168.8 64.1 \n",
"2 hatchback rwd front 94.5 171.2 65.5 \n",
"3 sedan fwd front 99.8 176.6 66.2 \n",
"4 sedan 4wd front 99.4 176.6 66.4 \n",
"5 sedan fwd front 99.8 177.3 66.3 \n",
"6 sedan fwd front 105.8 192.7 71.4 \n",
"7 wagon fwd front 105.8 192.7 71.4 \n",
"8 sedan fwd front 105.8 192.7 71.4 \n",
"9 hatchback 4wd front 99.5 178.2 67.9 \n",
"\n",
" height curb_weight engine_type num_cylinders engine_size fuel_system \\\n",
"0 48.8 2548 dohc four 130 mpfi \n",
"1 48.8 2548 dohc four 130 mpfi \n",
"2 52.4 2823 ohcv six 152 mpfi \n",
"3 54.3 2337 ohc four 109 mpfi \n",
"4 54.3 2824 ohc five 136 mpfi \n",
"5 53.1 2507 ohc five 136 mpfi \n",
"6 55.7 2844 ohc five 136 mpfi \n",
"7 55.7 2954 ohc five 136 mpfi \n",
"8 55.9 3086 ohc five 131 mpfi \n",
"9 52.0 3053 ohc five 131 mpfi \n",
"\n",
" bore stroke compression_ratio horsepower peak_rpm city_mpg \\\n",
"0 3.47 2.68 9.0 111.0 5000.0 21 \n",
"1 3.47 2.68 9.0 111.0 5000.0 21 \n",
"2 2.68 3.47 9.0 154.0 5000.0 19 \n",
"3 3.19 3.40 10.0 102.0 5500.0 24 \n",
"4 3.19 3.40 8.0 115.0 5500.0 18 \n",
"5 3.19 3.40 8.5 110.0 5500.0 19 \n",
"6 3.19 3.40 8.5 110.0 5500.0 19 \n",
"7 3.19 3.40 8.5 110.0 5500.0 19 \n",
"8 3.13 3.40 8.3 140.0 5500.0 17 \n",
"9 3.13 3.40 7.0 160.0 5500.0 16 \n",
"\n",
" highway_mpg price \n",
"0 27 13495.0 \n",
"1 27 16500.0 \n",
"2 26 16500.0 \n",
"3 30 13950.0 \n",
"4 22 17450.0 \n",
"5 25 15250.0 \n",
"6 25 17710.0 \n",
"7 25 18920.0 \n",
"8 20 23875.0 \n",
"9 22 NaN "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"headers = [\"symboling\", \"normalized_losses\", \"make\", \"fuel_type\", \"aspiration\",\n",
" \"num_doors\", \"body_style\", \"drive_wheels\", \"engine_location\",\n",
" \"wheel_base\", \"length\", \"width\", \"height\", \"curb_weight\",\n",
" \"engine_type\", \"num_cylinders\", \"engine_size\", \"fuel_system\",\n",
" \"bore\", \"stroke\", \"compression_ratio\", \"horsepower\", \"peak_rpm\",\n",
" \"city_mpg\", \"highway_mpg\", \"price\"]\n",
"\n",
"df = pd.read_csv(\"http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data\",\n",
" header=None, names=headers, na_values=\"?\" )\n",
"df.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 各列、特徴量のデータのタイプを確認する。"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"symboling int64\n",
"normalized_losses float64\n",
"make object\n",
"fuel_type object\n",
"aspiration object\n",
"num_doors object\n",
"body_style object\n",
"drive_wheels object\n",
"engine_location object\n",
"wheel_base float64\n",
"length float64\n",
"width float64\n",
"height float64\n",
"curb_weight int64\n",
"engine_type object\n",
"num_cylinders object\n",
"engine_size int64\n",
"fuel_system object\n",
"bore float64\n",
"stroke float64\n",
"compression_ratio float64\n",
"horsepower float64\n",
"peak_rpm float64\n",
"city_mpg int64\n",
"highway_mpg int64\n",
"price float64\n",
"dtype: object"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NULLがある列を確認する。"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"symboling False\n",
"normalized_losses True\n",
"make False\n",
"fuel_type False\n",
"aspiration False\n",
"num_doors True\n",
"body_style False\n",
"drive_wheels False\n",
"engine_location False\n",
"wheel_base False\n",
"length False\n",
"width False\n",
"height False\n",
"curb_weight False\n",
"engine_type False\n",
"num_cylinders False\n",
"engine_size False\n",
"fuel_system False\n",
"bore True\n",
"stroke True\n",
"compression_ratio False\n",
"horsepower True\n",
"peak_rpm True\n",
"city_mpg False\n",
"highway_mpg False\n",
"price True\n",
"dtype: bool"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().any()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"normalized_losses 41\n",
"num_doors 2\n",
"bore 4\n",
"stroke 4\n",
"horsepower 2\n",
"peak_rpm 2\n",
"price 4\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()[df.isnull().sum() != 0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>symboling</th>\n",
" <th>normalized_losses</th>\n",
" <th>make</th>\n",
" <th>fuel_type</th>\n",
" <th>aspiration</th>\n",
" <th>num_doors</th>\n",
" <th>body_style</th>\n",
" <th>drive_wheels</th>\n",
" <th>engine_location</th>\n",
" <th>wheel_base</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>height</th>\n",
" <th>curb_weight</th>\n",
" <th>engine_type</th>\n",
" <th>num_cylinders</th>\n",
" <th>engine_size</th>\n",
" <th>fuel_system</th>\n",
" <th>bore</th>\n",
" <th>stroke</th>\n",
" <th>compression_ratio</th>\n",
" <th>horsepower</th>\n",
" <th>peak_rpm</th>\n",
" <th>city_mpg</th>\n",
" <th>highway_mpg</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>alfa-romero</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>two</td>\n",
" <td>convertible</td>\n",
" <td>rwd</td>\n",
" <td>front</td>\n",
" <td>88.6</td>\n",
" <td>168.8</td>\n",
" <td>64.1</td>\n",
" <td>48.8</td>\n",
" <td>2548</td>\n",
" <td>dohc</td>\n",
" <td>four</td>\n",
" <td>130</td>\n",
" <td>mpfi</td>\n",
" <td>3.47</td>\n",
" <td>2.68</td>\n",
" <td>9.0</td>\n",
" <td>111.0</td>\n",
" <td>5000.0</td>\n",
" <td>21</td>\n",
" <td>27</td>\n",
" <td>13495.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>alfa-romero</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>two</td>\n",
" <td>convertible</td>\n",
" <td>rwd</td>\n",
" <td>front</td>\n",
" <td>88.6</td>\n",
" <td>168.8</td>\n",
" <td>64.1</td>\n",
" <td>48.8</td>\n",
" <td>2548</td>\n",
" <td>dohc</td>\n",
" <td>four</td>\n",
" <td>130</td>\n",
" <td>mpfi</td>\n",
" <td>3.47</td>\n",
" <td>2.68</td>\n",
" <td>9.0</td>\n",
" <td>111.0</td>\n",
" <td>5000.0</td>\n",
" <td>21</td>\n",
" <td>27</td>\n",
" <td>16500.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>alfa-romero</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>two</td>\n",
" <td>hatchback</td>\n",
" <td>rwd</td>\n",
" <td>front</td>\n",
" <td>94.5</td>\n",
" <td>171.2</td>\n",
" <td>65.5</td>\n",
" <td>52.4</td>\n",
" <td>2823</td>\n",
" <td>ohcv</td>\n",
" <td>six</td>\n",
" <td>152</td>\n",
" <td>mpfi</td>\n",
" <td>2.68</td>\n",
" <td>3.47</td>\n",
" <td>9.0</td>\n",
" <td>154.0</td>\n",
" <td>5000.0</td>\n",
" <td>19</td>\n",
" <td>26</td>\n",
" <td>16500.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>audi</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>two</td>\n",
" <td>sedan</td>\n",
" <td>fwd</td>\n",
" <td>front</td>\n",
" <td>99.8</td>\n",
" <td>177.3</td>\n",
" <td>66.3</td>\n",
" <td>53.1</td>\n",
" <td>2507</td>\n",
" <td>ohc</td>\n",
" <td>five</td>\n",
" <td>136</td>\n",
" <td>mpfi</td>\n",
" <td>3.19</td>\n",
" <td>3.40</td>\n",
" <td>8.5</td>\n",
" <td>110.0</td>\n",
" <td>5500.0</td>\n",
" <td>19</td>\n",
" <td>25</td>\n",
" <td>15250.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>audi</td>\n",
" <td>gas</td>\n",
" <td>std</td>\n",
" <td>four</td>\n",
" <td>wagon</td>\n",
" <td>fwd</td>\n",
" <td>front</td>\n",
" <td>105.8</td>\n",
" <td>192.7</td>\n",
" <td>71.4</td>\n",
" <td>55.7</td>\n",
" <td>2954</td>\n",
" <td>ohc</td>\n",
" <td>five</td>\n",
" <td>136</td>\n",
" <td>mpfi</td>\n",
" <td>3.19</td>\n",
" <td>3.40</td>\n",
" <td>8.5</td>\n",
" <td>110.0</td>\n",
" <td>5500.0</td>\n",
" <td>19</td>\n",
" <td>25</td>\n",
" <td>18920.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" symboling normalized_losses make fuel_type aspiration num_doors \\\n",
"0 3 NaN alfa-romero gas std two \n",
"1 3 NaN alfa-romero gas std two \n",
"2 1 NaN alfa-romero gas std two \n",
"5 2 NaN audi gas std two \n",
"7 1 NaN audi gas std four \n",
"\n",
" body_style drive_wheels engine_location wheel_base length width \\\n",
"0 convertible rwd front 88.6 168.8 64.1 \n",
"1 convertible rwd front 88.6 168.8 64.1 \n",
"2 hatchback rwd front 94.5 171.2 65.5 \n",
"5 sedan fwd front 99.8 177.3 66.3 \n",
"7 wagon fwd front 105.8 192.7 71.4 \n",
"\n",
" height curb_weight engine_type num_cylinders engine_size fuel_system \\\n",
"0 48.8 2548 dohc four 130 mpfi \n",
"1 48.8 2548 dohc four 130 mpfi \n",
"2 52.4 2823 ohcv six 152 mpfi \n",
"5 53.1 2507 ohc five 136 mpfi \n",
"7 55.7 2954 ohc five 136 mpfi \n",
"\n",
" bore stroke compression_ratio horsepower peak_rpm city_mpg \\\n",
"0 3.47 2.68 9.0 111.0 5000.0 21 \n",
"1 3.47 2.68 9.0 111.0 5000.0 21 \n",
"2 2.68 3.47 9.0 154.0 5000.0 19 \n",
"5 3.19 3.40 8.5 110.0 5500.0 19 \n",
"7 3.19 3.40 8.5 110.0 5500.0 19 \n",
"\n",
" highway_mpg price \n",
"0 27 13495.0 \n",
"1 27 16500.0 \n",
"2 26 16500.0 \n",
"5 25 15250.0 \n",
"7 25 18920.0 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.isnull().any(axis=1)].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# sklearn.preprocessing.Imputer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"ちなみにNULL値は次のコードで置き換えられる。\n",
"- http://lijiancheng0614.github.io/scikit-learn/modules/generated/sklearn.preprocessing.Imputer.html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 各列の大まかな分析をする"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>symboling</th>\n",
" <th>normalized_losses</th>\n",
" <th>wheel_base</th>\n",
" <th>length</th>\n",
" <th>width</th>\n",
" <th>height</th>\n",
" <th>curb_weight</th>\n",
" <th>engine_size</th>\n",
" <th>bore</th>\n",
" <th>stroke</th>\n",
" <th>compression_ratio</th>\n",
" <th>horsepower</th>\n",
" <th>peak_rpm</th>\n",
" <th>city_mpg</th>\n",
" <th>highway_mpg</th>\n",
" <th>price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>205.000000</td>\n",
" <td>164.000000</td>\n",
" <td>205.000000</td>\n",
" <td>205.000000</td>\n",
" <td>205.000000</td>\n",
" <td>205.000000</td>\n",
" <td>205.000000</td>\n",
" <td>205.000000</td>\n",
" <td>201.000000</td>\n",
" <td>201.000000</td>\n",
" <td>205.000000</td>\n",
" <td>203.000000</td>\n",
" <td>203.000000</td>\n",
" <td>205.000000</td>\n",
" <td>205.000000</td>\n",
" <td>201.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.834146</td>\n",
" <td>122.000000</td>\n",
" <td>98.756585</td>\n",
" <td>174.049268</td>\n",
" <td>65.907805</td>\n",
" <td>53.724878</td>\n",
" <td>2555.565854</td>\n",
" <td>126.907317</td>\n",
" <td>3.329751</td>\n",
" <td>3.255423</td>\n",
" <td>10.142537</td>\n",
" <td>104.256158</td>\n",
" <td>5125.369458</td>\n",
" <td>25.219512</td>\n",
" <td>30.751220</td>\n",
" <td>13207.129353</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>1.245307</td>\n",
" <td>35.442168</td>\n",
" <td>6.021776</td>\n",
" <td>12.337289</td>\n",
" <td>2.145204</td>\n",
" <td>2.443522</td>\n",
" <td>520.680204</td>\n",
" <td>41.642693</td>\n",
" <td>0.273539</td>\n",
" <td>0.316717</td>\n",
" <td>3.972040</td>\n",
" <td>39.714369</td>\n",
" <td>479.334560</td>\n",
" <td>6.542142</td>\n",
" <td>6.886443</td>\n",
" <td>7947.066342</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-2.000000</td>\n",
" <td>65.000000</td>\n",
" <td>86.600000</td>\n",
" <td>141.100000</td>\n",
" <td>60.300000</td>\n",
" <td>47.800000</td>\n",
" <td>1488.000000</td>\n",
" <td>61.000000</td>\n",
" <td>2.540000</td>\n",
" <td>2.070000</td>\n",
" <td>7.000000</td>\n",
" <td>48.000000</td>\n",
" <td>4150.000000</td>\n",
" <td>13.000000</td>\n",
" <td>16.000000</td>\n",
" <td>5118.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.000000</td>\n",
" <td>94.000000</td>\n",
" <td>94.500000</td>\n",
" <td>166.300000</td>\n",
" <td>64.100000</td>\n",
" <td>52.000000</td>\n",
" <td>2145.000000</td>\n",
" <td>97.000000</td>\n",
" <td>3.150000</td>\n",
" <td>3.110000</td>\n",
" <td>8.600000</td>\n",
" <td>70.000000</td>\n",
" <td>4800.000000</td>\n",
" <td>19.000000</td>\n",
" <td>25.000000</td>\n",
" <td>7775.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.000000</td>\n",
" <td>115.000000</td>\n",
" <td>97.000000</td>\n",
" <td>173.200000</td>\n",
" <td>65.500000</td>\n",
" <td>54.100000</td>\n",
" <td>2414.000000</td>\n",
" <td>120.000000</td>\n",
" <td>3.310000</td>\n",
" <td>3.290000</td>\n",
" <td>9.000000</td>\n",
" <td>95.000000</td>\n",
" <td>5200.000000</td>\n",
" <td>24.000000</td>\n",
" <td>30.000000</td>\n",
" <td>10295.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>2.000000</td>\n",
" <td>150.000000</td>\n",
" <td>102.400000</td>\n",
" <td>183.100000</td>\n",
" <td>66.900000</td>\n",
" <td>55.500000</td>\n",
" <td>2935.000000</td>\n",
" <td>141.000000</td>\n",
" <td>3.590000</td>\n",
" <td>3.410000</td>\n",
" <td>9.400000</td>\n",
" <td>116.000000</td>\n",
" <td>5500.000000</td>\n",
" <td>30.000000</td>\n",
" <td>34.000000</td>\n",
" <td>16500.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>3.000000</td>\n",
" <td>256.000000</td>\n",
" <td>120.900000</td>\n",
" <td>208.100000</td>\n",
" <td>72.300000</td>\n",
" <td>59.800000</td>\n",
" <td>4066.000000</td>\n",
" <td>326.000000</td>\n",
" <td>3.940000</td>\n",
" <td>4.170000</td>\n",
" <td>23.000000</td>\n",
" <td>288.000000</td>\n",
" <td>6600.000000</td>\n",
" <td>49.000000</td>\n",
" <td>54.000000</td>\n",
" <td>45400.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" symboling normalized_losses wheel_base length width \\\n",
"count 205.000000 164.000000 205.000000 205.000000 205.000000 \n",
"mean 0.834146 122.000000 98.756585 174.049268 65.907805 \n",
"std 1.245307 35.442168 6.021776 12.337289 2.145204 \n",
"min -2.000000 65.000000 86.600000 141.100000 60.300000 \n",
"25% 0.000000 94.000000 94.500000 166.300000 64.100000 \n",
"50% 1.000000 115.000000 97.000000 173.200000 65.500000 \n",
"75% 2.000000 150.000000 102.400000 183.100000 66.900000 \n",
"max 3.000000 256.000000 120.900000 208.100000 72.300000 \n",
"\n",
" height curb_weight engine_size bore stroke \\\n",
"count 205.000000 205.000000 205.000000 201.000000 201.000000 \n",
"mean 53.724878 2555.565854 126.907317 3.329751 3.255423 \n",
"std 2.443522 520.680204 41.642693 0.273539 0.316717 \n",
"min 47.800000 1488.000000 61.000000 2.540000 2.070000 \n",
"25% 52.000000 2145.000000 97.000000 3.150000 3.110000 \n",
"50% 54.100000 2414.000000 120.000000 3.310000 3.290000 \n",
"75% 55.500000 2935.000000 141.000000 3.590000 3.410000 \n",
"max 59.800000 4066.000000 326.000000 3.940000 4.170000 \n",
"\n",
" compression_ratio horsepower peak_rpm city_mpg highway_mpg \\\n",
"count 205.000000 203.000000 203.000000 205.000000 205.000000 \n",
"mean 10.142537 104.256158 5125.369458 25.219512 30.751220 \n",
"std 3.972040 39.714369 479.334560 6.542142 6.886443 \n",
"min 7.000000 48.000000 4150.000000 13.000000 16.000000 \n",
"25% 8.600000 70.000000 4800.000000 19.000000 25.000000 \n",
"50% 9.000000 95.000000 5200.000000 24.000000 30.000000 \n",
"75% 9.400000 116.000000 5500.000000 30.000000 34.000000 \n",
"max 23.000000 288.000000 6600.000000 49.000000 54.000000 \n",
"\n",
" price \n",
"count 201.000000 \n",
"mean 13207.129353 \n",
"std 7947.066342 \n",
"min 5118.000000 \n",
"25% 7775.000000 \n",
"50% 10295.000000 \n",
"75% 16500.000000 \n",
"max 45400.000000 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x1a1e694ba8>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 792x648 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"corr = df.corr()\n",
"# Generate a mask for the upper triangle\n",
"mask = np.zeros_like(corr, dtype=np.bool)\n",
"mask[np.triu_indices_from(mask)] = True\n",
"\n",
"# Set up the matplotlib figure\n",
"f, ax = plt.subplots(figsize=(11, 9))\n",
"\n",
"# Draw the heatmap with the mask and correct aspect ratio\n",
"sns.heatmap(corr, mask=mask, vmax=.8, center=0,\n",
" square=True, linewidths=.5, cbar_kws={\"shrink\": .5})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# LabelEncoder"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',\n",
" 'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury',\n",
" 'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche', 'renault',\n",
" 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"make\"].unique()\n",
"df[\"make\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>make</th>\n",
" <th>make_encode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>alfa-romero</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>alfa-romero</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>alfa-romero</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>bmw</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" make make_encode\n",
"0 alfa-romero 0\n",
"1 alfa-romero 0\n",
"2 alfa-romero 0\n",
"3 audi 1\n",
"4 audi 1\n",
"5 audi 1\n",
"6 audi 1\n",
"7 audi 1\n",
"8 audi 1\n",
"9 audi 1\n",
"10 bmw 2"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"le = LabelEncoder()\n",
"df[\"make_encode\"] = le.fit_transform(df[\"make\"])\n",
"df[[\"make\", \"make_encode\"]].head(11)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# LabelBinarizer"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelBinarizer"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['convertible', 'hatchback', 'sedan', 'wagon', 'hardtop'],\n",
" dtype=object)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"body_style\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>convertible</th>\n",
" <th>hardtop</th>\n",
" <th>hatchback</th>\n",
" <th>sedan</th>\n",
" <th>wagon</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" convertible hardtop hatchback sedan wagon\n",
"0 1 0 0 0 0\n",
"1 1 0 0 0 0\n",
"2 0 0 1 0 0\n",
"3 0 0 0 1 0\n",
"4 0 0 0 1 0"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lb = LabelBinarizer()\n",
"lb_results = lb.fit_transform(df[\"body_style\"])\n",
"pd.DataFrame(lb_results, columns=lb.classes_).head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# OneHotEncoder"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"これは上で得た次のデータの[make_encode]に適応させます"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>make</th>\n",
" <th>make_encode</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>alfa-romero</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>alfa-romero</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>alfa-romero</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>audi</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>bmw</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" make make_encode\n",
"0 alfa-romero 0\n",
"1 alfa-romero 0\n",
"2 alfa-romero 0\n",
"3 audi 1\n",
"4 audi 1\n",
"5 audi 1\n",
"6 audi 1\n",
"7 audi 1\n",
"8 audi 1\n",
"9 audi 1\n",
"10 bmw 2"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the raw make labels next to their integer codes for the first 11 rows.\n",
"df.loc[:, [\"make\", \"make_encode\"]].head(n=11)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"この場合は各列が[alfa-romero, audi, bmw]に対応したOneHotVectorを返します。"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[1., 0., 0.],\n",
" [1., 0., 0.],\n",
" [1., 0., 0.],\n",
" [0., 1., 0.],\n",
" [0., 1., 0.],\n",
" [0., 1., 0.],\n",
" [0., 1., 0.],\n",
" [0., 1., 0.],\n",
" [0., 1., 0.],\n",
" [0., 1., 0.],\n",
" [0., 0., 1.]])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-hot encode the integer make codes of the first 11 rows (three distinct codes).\n",
"# reshape(-1, 1) turns the 1-D values into the single-column 2-D array handed to the encoder.\n",
"# toarray() densifies the encoder output explicitly; the `.A` shortcut used previously is a\n",
"# legacy alias that newer SciPy releases deprecate, so the documented method is used instead.\n",
"oh = OneHotEncoder()\n",
"oh.fit_transform(df[\"make_encode\"].head(11).values.reshape(-1, 1)).toarray()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MinMaxScaler"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Min-max scaler shared by the plotting cells below.\n",
"# feature_range=(0, 1) is the library default, spelled out here for the reader.\n",
"mms = MinMaxScaler(feature_range=(0, 1))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.legend.Legend at 0x1a28765f28>"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 504x504 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.datasets import make_blobs\n",
"\n",
"# One blob of 100 two-feature samples; random_state pins the draw so re-runs match.\n",
"X, Y = make_blobs(random_state=8,\n",
"                  n_samples=100,\n",
"                  n_features=2,\n",
"                  cluster_std=1.,\n",
"                  centers=1)\n",
"\n",
"# Rescale the features with the min-max scaler created in the previous cell.\n",
"X_minmax = mms.fit_transform(X)\n",
"\n",
"# Overlay the raw points (red) and the rescaled points (blue) on one figure.\n",
"plt.figure(figsize=(7, 7))\n",
"plt.scatter(X[:, 0], X[:, 1], marker='o', c=\"r\", s=25, label=\"original\")\n",
"plt.scatter(X_minmax[:, 0], X_minmax[:, 1], marker='o', c=\"b\", s=25, label=\"Normalization\")\n",
"\n",
"plt.legend()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Standard scaler shared by the plotting cells below.\n",
"# with_mean/with_std are the library defaults, spelled out here for the reader.\n",
"ss = StandardScaler(with_mean=True, with_std=True)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.legend.Legend at 0x1a287c64e0>"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 504x504 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.datasets import make_blobs\n",
"\n",
"# One blob of 100 two-feature samples, drawn with a fixed random_state.\n",
"X, Y = make_blobs(n_samples=100,\n",
"                  n_features=2,\n",
"                  centers=1,\n",
"                  cluster_std=1.,\n",
"                  random_state=8)\n",
"\n",
"# Transform the features with the StandardScaler instance created in the previous cell.\n",
"XX = ss.fit_transform(X)\n",
"\n",
"# Overlay the raw points (red) and the transformed points (green) on one figure.\n",
"plt.figure(figsize=(7, 7))\n",
"plt.scatter(X[:, 0], X[:, 1], label=\"original\", c=\"r\", s=25, marker='o')\n",
"plt.scatter(XX[:, 0], XX[:, 1], label=\"standardization\", c=\"g\", s=25, marker='o')\n",
"\n",
"plt.legend()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 標準化(StandardScaler)と正規化(MinMaxScaler)を比較"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.legend.Legend at 0x1a289030f0>"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 504x504 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Same generator settings as the earlier plotting cells, except cluster_std=2.\n",
"X, Y = make_blobs(random_state=8,\n",
"                  n_samples=100,\n",
"                  n_features=2,\n",
"                  cluster_std=2.,\n",
"                  centers=1)\n",
"\n",
"# Each scaler output gets its own name so that Y (the second value returned by\n",
"# make_blobs) is no longer overwritten with scaled feature data.\n",
"X_minmax = mms.fit_transform(X)\n",
"X_standard = ss.fit_transform(X)\n",
"\n",
"# Overlay raw (red), min-max scaled (blue) and standard-scaled (green) points on one figure.\n",
"plt.figure(figsize=(7, 7))\n",
"plt.scatter(X[:, 0], X[:, 1], marker='o', c=\"r\", s=25, label=\"original\")\n",
"plt.scatter(X_minmax[:, 0], X_minmax[:, 1], marker='o', c=\"b\", s=25, label=\"Normalization\")\n",
"plt.scatter(X_standard[:, 0], X_standard[:, 1], marker='o', c=\"green\", s=25, label=\"standardization\")\n",
"\n",
"plt.legend()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:py35]",
"language": "python",
"name": "conda-env-py35-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment