Created
October 15, 2020 16:12
-
-
Save sunkay/ec63b73b4e1f51e4d19bd0fc51977655 to your computer and use it in GitHub Desktop.
data_preprocessing.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "data_preprocessing.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyNK3K3vMMn+sEHn17tTX8Bt", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/sunkay/ec63b73b4e1f51e4d19bd0fc51977655/data_preprocessing.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "EDULpUSqt2BJ" | |
}, | |
"source": [ | |
"Import Libraries\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "mA0Q1B8tt7h7" | |
}, | |
"source": [ | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"import pandas as pd " | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "zS6_Hw5sui_4" | |
}, | |
"source": [ | |
"Import Datasets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nfg8l2Hgul62" | |
}, | |
"source": [ | |
"dataset = pd.read_csv('Data.csv')\n", | |
"X = dataset.iloc[:, :-1].values \n", | |
"y = dataset.iloc[:, -1].values" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "FH_eLL6g6fPK" | |
}, | |
"source": [ | |
"from google.colab import drive\n", | |
"drive.mount('/content/drive')" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "9IBwk8Lk5NzJ" | |
}, | |
"source": [ | |
"print(X)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "AKiefF4w6zbT" | |
}, | |
"source": [ | |
"print(y)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "VHjtaHoF7fvx" | |
}, | |
"source": [ | |
"# handle missing values\n", | |
"from sklearn.impute import SimpleImputer\n", | |
"imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n", | |
"imputer.fit(X[:, 1:3])\n", | |
"X[:, 1:3] = imputer.transform(X[:, 1:3])" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Fzz9j-4vAOYz" | |
}, | |
"source": [ | |
"print(X)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "uTGc80KLRu7o" | |
}, | |
"source": [ | |
"Encoding categorical data\n", | |
"Turn strings or categories into numbers. \n", | |
"One-hot encoding: turn one column into multiple columns, converting strings into binary vectors." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ZfcetAbbSCSV" | |
}, | |
"source": [ | |
"# Encoding categorical data\n", | |
"from sklearn.compose import ColumnTransformer\n", | |
"from sklearn.preprocessing import OneHotEncoder\n", | |
"ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')\n", | |
"X = np.array(ct.fit_transform(X))" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Wkm0GsPNSACz" | |
}, | |
"source": [ | |
"print(X)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "HhbToG1YmQ1W" | |
}, | |
"source": [ | |
"Encoding the Dependent Variable\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "J9zvogrGmSgl" | |
}, | |
"source": [ | |
"from sklearn.preprocessing import LabelEncoder\n", | |
"le = LabelEncoder()\n", | |
"y = le.fit_transform(y)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "lgUHND6dmiA3" | |
}, | |
"source": [ | |
"print(y)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "eWp92IYUpSrz" | |
}, | |
"source": [ | |
"Split Test & Train" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "M1ehH-8hmjeP" | |
}, | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n", | |
"X_train, X_test, y_train, y_test = \\\n", | |
" train_test_split(X, y, test_size = 0.2, random_state = 1)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "asFcEL_Oo7IN" | |
}, | |
"source": [ | |
"print(X_train)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "IYur8BzGpEHo" | |
}, | |
"source": [ | |
"print(X_test)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gEzICbOCpFou" | |
}, | |
"source": [ | |
"print(y_train)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "yZbw_rl-pH0U" | |
}, | |
"source": [ | |
"print(y_test)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "cn7mWSUepPVf" | |
}, | |
"source": [ | |
"Feature scaling. Apply standardization rather than normalization. \n", | |
"\n", | |
"Do not apply standardization to dummy variables or other categorically encoded features." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Hnv2t6ESpQub" | |
}, | |
"source": [ | |
"from sklearn.preprocessing import StandardScaler\n", | |
"sc = StandardScaler()\n", | |
"X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])\n", | |
"X_test[:, 3:] = sc.transform(X_test[:, 3:])" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0V2qNKdqpRUh" | |
}, | |
"source": [ | |
"print(X_train)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_GtEV_OZFCFp" | |
}, | |
"source": [ | |
"print(X_test)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "vA5wnG9aFE4S" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment