data_preprocessing.ipynb
@sunkay · Created October 15, 2020 16:12
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "data_preprocessing.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNK3K3vMMn+sEHn17tTX8Bt",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/sunkay/ec63b73b4e1f51e4d19bd0fc51977655/data_preprocessing.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EDULpUSqt2BJ"
},
"source": [
"Import Libraries\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "mA0Q1B8tt7h7"
},
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd "
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "zS6_Hw5sui_4"
},
"source": [
"Import Datasets"
]
},
{
"cell_type": "code",
"metadata": {
"id": "nfg8l2Hgul62"
},
"source": [
"dataset = pd.read_csv('Data.csv')\n",
"X = dataset.iloc[:, :-1].values \n",
"y = dataset.iloc[:, -1].values"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "FH_eLL6g6fPK"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "9IBwk8Lk5NzJ"
},
"source": [
"print(X)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "AKiefF4w6zbT"
},
"source": [
"print(y)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "VHjtaHoF7fvx"
},
"source": [
"# handle missing values\n",
"from sklearn.impute import SimpleImputer\n",
"imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
"imputer.fit(X[:, 1:3])\n",
"X[:, 1:3] = imputer.transform(X[:, 1:3])"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Fzz9j-4vAOYz"
},
"source": [
"print(X)"
],
"execution_count": null,
"outputs": []
},
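{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick check (illustrative): assuming, as the imputation cell above does, that columns 1 and 2 are the numeric ones, they should contain no NaNs after imputation."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# sanity-check sketch: count any NaNs left in the imputed numeric columns\n",
"print(np.isnan(X[:, 1:3].astype(float)).sum(), 'missing values remain')"
],
"execution_count": null,
"outputs": []
},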
{
"cell_type": "markdown",
"metadata": {
"id": "uTGc80KLRu7o"
},
"source": [
"Encoding categorical data\n",
"Turn strings or categores into numbers. \n",
"One Hot Encoding. Turn one column into multiple columns. Converts strings into binary vectors"
]
},
{
"cell_type": "code",
"metadata": {
"id": "ZfcetAbbSCSV"
},
"source": [
"# Encoding categorical data\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')\n",
"X = np.array(ct.fit_transform(X))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Wkm0GsPNSACz"
},
"source": [
"print(X)"
],
"execution_count": null,
"outputs": []
},
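{
"cell_type": "markdown",
"metadata": {},
"source": [
"Illustrative sketch (not part of the original preprocessing flow): a made-up single-column example showing how OneHotEncoder converts strings into binary vectors, one 0/1 column per distinct value."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# toy example: one-hot encode a made-up column of city names\n",
"import numpy as np\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"demo = np.array([['Paris'], ['London'], ['Paris'], ['Tokyo']])\n",
"enc = OneHotEncoder()\n",
"print(enc.fit_transform(demo).toarray())  # each row is now a binary vector\n",
"print(enc.categories_)                    # which category each binary column represents"
],
"execution_count": null,
"outputs": []
},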
{
"cell_type": "markdown",
"metadata": {
"id": "HhbToG1YmQ1W"
},
"source": [
"Encoding the Independent Variable\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "J9zvogrGmSgl"
},
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"le = LabelEncoder()\n",
"y = le.fit_transform(y)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "lgUHND6dmiA3"
},
"source": [
"print(y)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "eWp92IYUpSrz"
},
"source": [
"Split Test & Train"
]
},
{
"cell_type": "code",
"metadata": {
"id": "M1ehH-8hmjeP"
},
"source": [
"from sklearn.model_selection import train_test_split\n",
"X_train, X_test, y_train, y_test = \\\n",
" train_test_split(X, y, test_size = 0.2, random_state = 1)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "asFcEL_Oo7IN"
},
"source": [
"print(X_train)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "IYur8BzGpEHo"
},
"source": [
"print(X_test)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "gEzICbOCpFou"
},
"source": [
"print(y_train)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "yZbw_rl-pH0U"
},
"source": [
"print(y_test)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "cn7mWSUepPVf"
},
"source": [
"Feature Scaling. Apply standardization instead of normalization. \n",
"\n",
"Do not apply standardization on dummy variables or categorically encoded features"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Hnv2t6ESpQub"
},
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"sc = StandardScaler()\n",
"X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])\n",
"X_test[:, 3:] = sc.transform(X_test[:, 3:])"
],
"execution_count": null,
"outputs": []
},
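{
"cell_type": "markdown",
"metadata": {},
"source": [
"Comparison sketch (illustrative, on a made-up array): standardization rescales a feature to zero mean and unit variance, x' = (x - mean) / std, while min-max normalization squeezes it into [0, 1], x' = (x - min) / (max - min)."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# toy comparison of standardization vs. min-max normalization\n",
"import numpy as np\n",
"values = np.array([35.0, 27.0, 48.0, 52.0, 44.0])\n",
"standardized = (values - values.mean()) / values.std()                 # zero mean, unit variance\n",
"normalized = (values - values.min()) / (values.max() - values.min())   # squeezed into [0, 1]\n",
"print(standardized)\n",
"print(normalized)"
],
"execution_count": null,
"outputs": []
},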
{
"cell_type": "code",
"metadata": {
"id": "0V2qNKdqpRUh"
},
"source": [
"print(X_train)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_GtEV_OZFCFp"
},
"source": [
"print(X_test)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "vA5wnG9aFE4S"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}