whan0623/part5_.ipynb Secret

## part5_.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Part5_데이터사전처리.ipynb",
      "provenance": [],
      "authorship_tag": "ABX9TyPA3kwbM8SdjXfylNilsnV9",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/whan0623/8bfe3f9347c2758d958bf88a030d3e1d/part5_.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "eR72i9o5uA17"
      },
      "source": [
        "# 누락 데이터 처리 #"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "yzJCA6OcxStS"
      },
      "source": [
        "## 누락 데이터 확인 ##"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qZiJgHBPt6NE",
        "outputId": "c6ec6a8c-522d-4589-8449-8eb36d0d6bb7"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# seaborn provides load_dataset() for the sample data used here\n",
        "import seaborn as sns\n",
        "\n",
        "# Load the titanic dataset as a DataFrame\n",
        "df = sns.load_dataset('titanic')\n",
        "\n",
        "# Frequency of each deck value, with NaN counted as its own entry\n",
        "nan_deck = df['deck'].value_counts(dropna=False)\n",
        "print(nan_deck)\n",
        "\n",
        "# The element-wise checks below only look at the first five rows\n",
        "first_rows = df.head()\n",
        "\n",
        "# isnull(): True marks a missing cell\n",
        "missing_mask = first_rows.isnull()\n",
        "print(missing_mask)\n",
        "\n",
        "# notnull(): True marks a cell that holds a value\n",
        "print(first_rows.notnull())\n",
        "\n",
        "# Summing the isnull() mask down each column gives the NaN count per column\n",
        "print(missing_mask.sum(axis=0))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "NaN    688\n",
            "C       59\n",
            "B       47\n",
            "D       33\n",
            "E       32\n",
            "A       15\n",
            "F       13\n",
            "G        4\n",
            "Name: deck, dtype: int64\n",
            "   survived  pclass    sex    age  ...   deck  embark_town  alive  alone\n",
            "0     False   False  False  False  ...   True        False  False  False\n",
            "1     False   False  False  False  ...  False        False  False  False\n",
            "2     False   False  False  False  ...   True        False  False  False\n",
            "3     False   False  False  False  ...  False        False  False  False\n",
            "4     False   False  False  False  ...   True        False  False  False\n",
            "\n",
            "[5 rows x 15 columns]\n",
            "   survived  pclass   sex   age  ...   deck  embark_town  alive  alone\n",
            "0      True    True  True  True  ...  False         True   True   True\n",
            "1      True    True  True  True  ...   True         True   True   True\n",
            "2      True    True  True  True  ...  False         True   True   True\n",
            "3      True    True  True  True  ...   True         True   True   True\n",
            "4      True    True  True  True  ...  False         True   True   True\n",
            "\n",
            "[5 rows x 15 columns]\n",
            "survived       0\n",
            "pclass         0\n",
            "sex            0\n",
            "age            0\n",
            "sibsp          0\n",
            "parch          0\n",
            "fare           0\n",
            "embarked       0\n",
            "class          0\n",
            "who            0\n",
            "adult_male     0\n",
            "deck           3\n",
            "embark_town    0\n",
            "alive          0\n",
            "alone          0\n",
            "dtype: int64\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "eh6490EFxNfj"
      },
      "source": [
        "## 누락 데이터 제거"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "UDlzXLmNxXzq",
        "outputId": "60632714-2c58-4114-cb9e-29e7fc40f9b4"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the library\n",
        "import seaborn as sns\n",
        "\n",
        "# Load the titanic dataset\n",
        "df = sns.load_dataset('titanic')\n",
        "\n",
        "# Count the NaN values in every column.\n",
        "# Summing the boolean isnull() mask yields the count directly (0 when a\n",
        "# column has no NaN), so no try/except around a missing True key is needed.\n",
        "missing_df = df.isnull()\n",
        "for col in missing_df.columns:\n",
        "    missing_count = missing_df[col].sum()\n",
        "    print(col, ': ', missing_count)\n",
        "\n",
        "# Drop every column that has fewer than 500 non-NaN values: thresh is the\n",
        "# minimum number of valid values a column needs in order to be kept.\n",
        "# Only deck is removed (688 NaN out of 891 rows).\n",
        "df_thresh = df.dropna(axis=1, thresh=500)\n",
        "print(df_thresh.columns)\n",
        "\n",
        "# Drop every row whose age is missing (177 NaN out of 891 rows)\n",
        "df_age = df.dropna(subset=['age'], how='any', axis=0)\n",
        "print(len(df_age))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "survived :  0\n",
            "pclass :  0\n",
            "sex :  0\n",
            "age :  177\n",
            "sibsp :  0\n",
            "parch :  0\n",
            "fare :  0\n",
            "embarked :  2\n",
            "class :  0\n",
            "who :  0\n",
            "adult_male :  0\n",
            "deck :  688\n",
            "embark_town :  2\n",
            "alive :  0\n",
            "alone :  0\n",
            "Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',\n",
            "       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',\n",
            "       'alone'],\n",
            "      dtype='object')\n",
            "714\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TFTkupxBE-Ln"
      },
      "source": [
        "## 누락 데이터 치환 ##"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5AKKsflDFym9",
        "outputId": "1e485ac8-287c-4a5a-f586-d0de6a2a57f5"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the library\n",
        "import seaborn as sns\n",
        "\n",
        "# Load the titanic dataset\n",
        "df = sns.load_dataset('titanic')\n",
        "\n",
        "# Show the first 10 age values (row 5 is NaN)\n",
        "print(df['age'].head(10))\n",
        "print('\\n')\n",
        "\n",
        "# Replace the NaN ages with the mean of the known ages\n",
        "mean_age = df['age'].mean(axis=0)   # mean() skips NaN values by default\n",
        "print(\"평균:\", mean_age)\n",
        "print('\\n')\n",
        "# Assign the result back instead of calling fillna(inplace=True) on the\n",
        "# selected column: the chained in-place form works on a temporary object\n",
        "# and does not update df under pandas Copy-on-Write.\n",
        "df['age'] = df['age'].fillna(mean_age)\n",
        "\n",
        "# Show the first 10 age values again (row 5 now holds the mean)\n",
        "print(df['age'].head(10))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "0    22.0\n",
            "1    38.0\n",
            "2    26.0\n",
            "3    35.0\n",
            "4    35.0\n",
            "5     NaN\n",
            "6    54.0\n",
            "7     2.0\n",
            "8    27.0\n",
            "9    14.0\n",
            "Name: age, dtype: float64\n",
            "\n",
            "\n",
            "평균: 29.69911764705882\n",
            "\n",
            "\n",
            "0    22.000000\n",
            "1    38.000000\n",
            "2    26.000000\n",
            "3    35.000000\n",
            "4    35.000000\n",
            "5    29.699118\n",
            "6    54.000000\n",
            "7     2.000000\n",
            "8    27.000000\n",
            "9    14.000000\n",
            "Name: age, dtype: float64\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2ExVxPGoGW89",
        "outputId": "3c315388-00a6-4f67-d597-2cf496644d23"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the library\n",
        "import seaborn as sns\n",
        "\n",
        "# Load the titanic dataset\n",
        "df = sns.load_dataset('titanic')\n",
        "\n",
        "# Show rows 825-829 of embark_town (row 829 is NaN)\n",
        "print(df['embark_town'][825:830])\n",
        "print('\\n')\n",
        "\n",
        "# Find the most frequent embarkation town (NaN excluded from the counts)\n",
        "most_freq = df['embark_town'].value_counts(dropna=True).idxmax()\n",
        "print(most_freq)\n",
        "print('\\n')\n",
        "\n",
        "# Assign the result back instead of calling fillna(inplace=True) on the\n",
        "# selected column: the chained in-place form works on a temporary object\n",
        "# and does not update df under pandas Copy-on-Write.\n",
        "df['embark_town'] = df['embark_town'].fillna(most_freq)\n",
        "\n",
        "# Show rows 825-829 again (the NaN in row 829 is now most_freq)\n",
        "print(df['embark_town'][825:830])"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "825     Queenstown\n",
            "826    Southampton\n",
            "827      Cherbourg\n",
            "828     Queenstown\n",
            "829            NaN\n",
            "Name: embark_town, dtype: object\n",
            "\n",
            "\n",
            "Southampton\n",
            "\n",
            "\n",
            "825     Queenstown\n",
            "826    Southampton\n",
            "827      Cherbourg\n",
            "828     Queenstown\n",
            "829    Southampton\n",
            "Name: embark_town, dtype: object\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YxvQXd4EJY4S",
        "outputId": "d071d195-a68e-4c3e-968f-31c7c766340a"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the library\n",
        "import seaborn as sns\n",
        "\n",
        "# Load the titanic dataset\n",
        "df = sns.load_dataset('titanic')\n",
        "\n",
        "# Show rows 825-829 of embark_town (row 829 is NaN)\n",
        "print(df['embark_town'][825:830])\n",
        "print('\\n')\n",
        "\n",
        "# Fill each NaN with the value of the preceding row (row 828 for row 829).\n",
        "# ffill() replaces the deprecated fillna(method='ffill'), and the result is\n",
        "# assigned back because a chained in-place call on the selected column does\n",
        "# not update df under pandas Copy-on-Write.\n",
        "df['embark_town'] = df['embark_town'].ffill()\n",
        "print(df['embark_town'][825:830])"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "825     Queenstown\n",
            "826    Southampton\n",
            "827      Cherbourg\n",
            "828     Queenstown\n",
            "829            NaN\n",
            "Name: embark_town, dtype: object\n",
            "\n",
            "\n",
            "825     Queenstown\n",
            "826    Southampton\n",
            "827      Cherbourg\n",
            "828     Queenstown\n",
            "829     Queenstown\n",
            "Name: embark_town, dtype: object\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wUcgSupCJVhC"
      },
      "source": [
        "# 중복 데이터 처리 #\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bK0Ac5cxJk0i"
      },
      "source": [
        "## 중복 데이터 확인 ##"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Q0jl4e6WJhM0",
        "outputId": "dcc49fd2-95ea-4085-a65b-e960055358e3"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the library\n",
        "import pandas as pd\n",
        "\n",
        "# Column data for a small frame that contains repeated rows\n",
        "sample_columns = {\n",
        "    'c1': ['a', 'a', 'b', 'a', 'b'],\n",
        "    'c2': [1, 1, 1, 2, 2],\n",
        "    'c3': [1, 1, 2, 2, 2],\n",
        "}\n",
        "df = pd.DataFrame(sample_columns)\n",
        "# end='\\n\\n\\n' reproduces the spacing of print(df) followed by print('\\n')\n",
        "print(df, end='\\n\\n\\n')\n",
        "\n",
        "# Flag each row that repeats an earlier row across all columns\n",
        "df_dup = df.duplicated()\n",
        "print(df_dup, end='\\n\\n\\n')\n",
        "\n",
        "# Flag each value of column c2 that already appeared higher up\n",
        "c2_values = df['c2']\n",
        "col_dup = c2_values.duplicated()\n",
        "print(col_dup)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  c1  c2  c3\n",
            "0  a   1   1\n",
            "1  a   1   1\n",
            "2  b   1   2\n",
            "3  a   2   2\n",
            "4  b   2   2\n",
            "\n",
            "\n",
            "0    False\n",
            "1     True\n",
            "2    False\n",
            "3    False\n",
            "4    False\n",
            "dtype: bool\n",
            "\n",
            "\n",
            "0    False\n",
            "1     True\n",
            "2     True\n",
            "3    False\n",
            "4     True\n",
            "Name: c2, dtype: bool\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lAe-KXEBK4rj"
      },
      "source": [
        "## 중복 데이터 제거"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wzErfrQJK6dr",
        "outputId": "5a93e77d-4d75-4a87-9b12-004521f45193"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the library\n",
        "import pandas as pd\n",
        "\n",
        "# Column data for a small frame that contains repeated rows\n",
        "sample_columns = {\n",
        "    'c1': ['a', 'a', 'b', 'a', 'b'],\n",
        "    'c2': [1, 1, 1, 2, 2],\n",
        "    'c3': [1, 1, 2, 2, 2],\n",
        "}\n",
        "df = pd.DataFrame(sample_columns)\n",
        "# end='\\n\\n\\n' reproduces the spacing of print(df) followed by print('\\n')\n",
        "print(df, end='\\n\\n\\n')\n",
        "\n",
        "# Keep only the first occurrence of each fully identical row\n",
        "df2 = df.drop_duplicates()\n",
        "print(df2, end='\\n\\n\\n')\n",
        "\n",
        "# Treat rows as duplicates when they agree on c2 and c3 only\n",
        "key_columns = ['c2', 'c3']\n",
        "df3 = df.drop_duplicates(subset=key_columns)\n",
        "print(df3)\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  c1  c2  c3\n",
            "0  a   1   1\n",
            "1  a   1   1\n",
            "2  b   1   2\n",
            "3  a   2   2\n",
            "4  b   2   2\n",
            "\n",
            "\n",
            "  c1  c2  c3\n",
            "0  a   1   1\n",
            "2  b   1   2\n",
            "3  a   2   2\n",
            "4  b   2   2\n",
            "\n",
            "\n",
            "  c1  c2  c3\n",
            "0  a   1   1\n",
            "2  b   1   2\n",
            "3  a   2   2\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wjd1TqMWLIXT"
      },
      "source": [
        "# 데이터 표준화 #"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "fJ478ngjLOnk",
        "outputId": "55d3539f-8592-483d-b8fd-c298dfadc328"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the library\n",
        "import pandas as pd\n",
        "\n",
        "# Column labels for the header-less CSV, in file order\n",
        "column_names = ['mpg','cylinders','displacement','horsepower','weight',\n",
        "                'acceleration','model year','origin','name']\n",
        "\n",
        "# Read the file without a header row, then attach the labels\n",
        "df = pd.read_csv('./auto-mpg.csv', header=None)\n",
        "df.columns = column_names\n",
        "# end='\\n\\n\\n' reproduces the spacing of print(...) followed by print('\\n')\n",
        "print(df.head(3), end='\\n\\n\\n')\n",
        "\n",
        "# Factor that converts mpg (miles per gallon) to kpl (kilometers per liter),\n",
        "# roughly 0.425\n",
        "mpg_to_kpl = 1.60934 / 3.78541\n",
        "\n",
        "# Store mpg scaled by the factor in a new kpl column\n",
        "df['kpl'] = df['mpg'].mul(mpg_to_kpl)\n",
        "print(df.head(3), end='\\n\\n\\n')\n",
        "\n",
        "# Round kpl to two decimal places\n",
        "df['kpl'] = df['kpl'].round(2)\n",
        "print(df.head(3))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "    mpg  cylinders  displacement  ... model year  origin                       name\n",
            "0  18.0          8         307.0  ...         70       1  chevrolet chevelle malibu\n",
            "1  15.0          8         350.0  ...         70       1          buick skylark 320\n",
            "2  18.0          8         318.0  ...         70       1         plymouth satellite\n",
            "\n",
            "[3 rows x 9 columns]\n",
            "\n",
            "\n",
            "    mpg  cylinders  displacement  ... origin                       name       kpl\n",
            "0  18.0          8         307.0  ...      1  chevrolet chevelle malibu  7.652571\n",
            "1  15.0          8         350.0  ...      1          buick skylark 320  6.377143\n",
            "2  18.0          8         318.0  ...      1         plymouth satellite  7.652571\n",
            "\n",
            "[3 rows x 10 columns]\n",
            "\n",
            "\n",
            "    mpg  cylinders  displacement  ... origin                       name   kpl\n",
            "0  18.0          8         307.0  ...      1  chevrolet chevelle malibu  7.65\n",
            "1  15.0          8         350.0  ...      1          buick skylark 320  6.38\n",
            "2  18.0          8         318.0  ...      1         plymouth satellite  7.65\n",
            "\n",
            "[3 rows x 10 columns]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HvNjuixbL308"
      },
      "source": [
        "## 자료형 변환 ##"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gdxPiJVXL5b2",
        "outputId": "327f0841-76ad-409c-a061-6c4f4118ac96"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the libraries\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "\n",
        "# Build df with read_csv()\n",
        "df = pd.read_csv('./auto-mpg.csv', header=None)\n",
        "\n",
        "# Assign the column names\n",
        "df.columns = ['mpg','cylinders','displacement','horsepower','weight',\n",
        "              'acceleration','model year','origin','name']\n",
        "\n",
        "# Check the dtype of every column\n",
        "print(df.dtypes)\n",
        "print('\\n')\n",
        "\n",
        "# Check the unique values of horsepower\n",
        "print(df['horsepower'].unique())\n",
        "print('\\n')\n",
        "\n",
        "# Remove the rows whose horsepower is the placeholder '?'.\n",
        "# The replace() result is assigned back: calling replace(inplace=True) on the\n",
        "# selected column works on a temporary object and does not update df under\n",
        "# pandas Copy-on-Write, which would make the astype('float') below fail.\n",
        "df['horsepower'] = df['horsepower'].replace('?', np.nan)   # '?' -> NaN\n",
        "df.dropna(subset=['horsepower'], axis=0, inplace=True)     # drop the NaN rows\n",
        "df['horsepower'] = df['horsepower'].astype('float')        # string -> float\n",
        "\n",
        "# Check the dtype of horsepower\n",
        "print(df['horsepower'].dtypes)\n",
        "print('\\n')\n",
        "\n",
        "# Check the unique values of origin\n",
        "print(df['origin'].unique())\n",
        "\n",
        "# Map the integer codes to text labels (assigned back for the same reason)\n",
        "df['origin'] = df['origin'].replace({1:'USA', 2:'EU', 3:'JAPAN'})\n",
        "\n",
        "# Check the unique values and dtype of origin\n",
        "print(df['origin'].unique())\n",
        "print(df['origin'].dtypes)\n",
        "print('\\n')\n",
        "\n",
        "# Convert the origin strings to the category dtype\n",
        "df['origin'] = df['origin'].astype('category')\n",
        "print(df['origin'].dtypes)\n",
        "\n",
        "# Convert the category dtype back to strings\n",
        "df['origin'] = df['origin'].astype('str')\n",
        "print(df['origin'].dtypes)\n",
        "\n",
        "# Convert the integer model year to the category dtype.\n",
        "# sample(3) picks random rows, so the rows shown differ between runs.\n",
        "print(df['model year'].sample(3))\n",
        "df['model year'] = df['model year'].astype('category')\n",
        "print(df['model year'].sample(3))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "mpg             float64\n",
            "cylinders         int64\n",
            "displacement    float64\n",
            "horsepower       object\n",
            "weight          float64\n",
            "acceleration    float64\n",
            "model year        int64\n",
            "origin            int64\n",
            "name             object\n",
            "dtype: object\n",
            "\n",
            "\n",
            "['130.0' '165.0' '150.0' '140.0' '198.0' '220.0' '215.0' '225.0' '190.0'\n",
            " '170.0' '160.0' '95.00' '97.00' '85.00' '88.00' '46.00' '87.00' '90.00'\n",
            " '113.0' '200.0' '210.0' '193.0' '?' '100.0' '105.0' '175.0' '153.0'\n",
            " '180.0' '110.0' '72.00' '86.00' '70.00' '76.00' '65.00' '69.00' '60.00'\n",
            " '80.00' '54.00' '208.0' '155.0' '112.0' '92.00' '145.0' '137.0' '158.0'\n",
            " '167.0' '94.00' '107.0' '230.0' '49.00' '75.00' '91.00' '122.0' '67.00'\n",
            " '83.00' '78.00' '52.00' '61.00' '93.00' '148.0' '129.0' '96.00' '71.00'\n",
            " '98.00' '115.0' '53.00' '81.00' '79.00' '120.0' '152.0' '102.0' '108.0'\n",
            " '68.00' '58.00' '149.0' '89.00' '63.00' '48.00' '66.00' '139.0' '103.0'\n",
            " '125.0' '133.0' '138.0' '135.0' '142.0' '77.00' '62.00' '132.0' '84.00'\n",
            " '64.00' '74.00' '116.0' '82.00']\n",
            "\n",
            "\n",
            "float64\n",
            "\n",
            "\n",
            "[1 3 2]\n",
            "['USA' 'JAPAN' 'EU']\n",
            "object\n",
            "\n",
            "\n",
            "category\n",
            "object\n",
            "321    80\n",
            "272    78\n",
            "198    76\n",
            "Name: model year, dtype: int64\n",
            "195    76\n",
            "11     70\n",
            "225    77\n",
            "Name: model year, dtype: category\n",
            "Categories (13, int64): [70, 71, 72, 73, ..., 79, 80, 81, 82]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AF9tmT3NOCYq"
      },
      "source": [
        "# 범주형(카테고리) 데이터 처리 #"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FhXemBhTOIHh"
      },
      "source": [
        "## 구간 분할 ##\n",
        "- 연속 범위를 분할하여 값 할당\n",
        "- np.histogram(df['horsepower'], bins=3) \n",
        "      count, bin_dividers = np.histogram(df['horsepower'], bins=3)\n",
        "  - 최소값~최대값 사이를 같은 폭으로 3개로 나눔\n",
        "  - (최대값(230) - 최소값(46)) / 3 = 61.3333\n",
        "  - [46, 107.3333, 168.6666, 230]  \n",
        "- pd.cut() 함수로 각 데이터를 3개의 bin에 할당\n",
        "      df['hp_bin'] = pd.cut(x=df['horsepower'],     \n",
        "                            bins=[46, 107.3333, 168.6666, 230],\n",
        "                            labels=['저출력', '보통출력', '고출력'],\n",
        "                            include_lowest=True)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "7w6_LVvpOFsy",
        "outputId": "7ed84183-f6e7-4f3e-bb6f-3e400054d144"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the libraries\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "# Build df with read_csv()\n",
        "df = pd.read_csv('./auto-mpg.csv', header=None)\n",
        "\n",
        "# Assign the column names\n",
        "df.columns = ['mpg','cylinders','displacement','horsepower','weight',\n",
        "              'acceleration','model year','origin','name']\n",
        "\n",
        "# Remove the rows whose horsepower is the placeholder '?' and convert to float.\n",
        "# The replace() result is assigned back: calling replace(inplace=True) on the\n",
        "# selected column works on a temporary object and does not update df under\n",
        "# pandas Copy-on-Write, which would make the astype('float') below fail.\n",
        "df['horsepower'] = df['horsepower'].replace('?', np.nan)   # '?' -> NaN\n",
        "df.dropna(subset=['horsepower'], axis=0, inplace=True)     # drop the NaN rows\n",
        "df['horsepower'] = df['horsepower'].astype('float')        # string -> float\n",
        "\n",
        "# np.histogram returns the edges that split the value range into 3 equal-width bins\n",
        "count, bin_dividers = np.histogram(df['horsepower'], bins=3)\n",
        "print(bin_dividers)\n",
        "\n",
        "# Names for the 3 bins, lowest range first\n",
        "bin_names = ['저출력', '보통출력', '고출력']\n",
        "\n",
        "# Assign every horsepower value to one of the 3 bins with pd.cut\n",
        "df['hp_bin'] = pd.cut(x=df['horsepower'],     # data to bin\n",
        "                      bins=bin_dividers,      # bin edges\n",
        "                      labels=bin_names,       # bin names\n",
        "                      include_lowest=True)    # include the first edge\n",
        "\n",
        "# Show the first 15 rows of horsepower and hp_bin\n",
        "print(df[['horsepower', 'hp_bin']].head(15))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[ 46.         107.33333333 168.66666667 230.        ]\n",
            "    horsepower hp_bin\n",
            "0        130.0   보통출력\n",
            "1        165.0   보통출력\n",
            "2        150.0   보통출력\n",
            "3        150.0   보통출력\n",
            "4        140.0   보통출력\n",
            "5        198.0    고출력\n",
            "6        220.0    고출력\n",
            "7        215.0    고출력\n",
            "8        225.0    고출력\n",
            "9        190.0    고출력\n",
            "10       170.0    고출력\n",
            "11       160.0   보통출력\n",
            "12       150.0   보통출력\n",
            "13       225.0    고출력\n",
            "14        95.0    저출력\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "cZF7rrioSbxZ"
      },
      "source": [
        "## 더미 변수 ##\n",
        "- 특정 단어를 숫자로 표현\n",
        "- 원핫인코딩(one-hot-encoding) 형태로 출력됨\n",
        "- hp_bin 열의 범주형 데이터를 더미 변수로 변환\n",
        "      horsepower_dummies = pd.get_dummies(df['hp_bin'])"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "eVwRMszxOHIw",
        "outputId": "941f11df-957f-4ea8-ddee-861e74c57c54"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the libraries\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "# Build df with read_csv()\n",
        "df = pd.read_csv('./auto-mpg.csv', header=None)\n",
        "\n",
        "# Assign the column names\n",
        "df.columns = ['mpg','cylinders','displacement','horsepower','weight',\n",
        "              'acceleration','model year','origin','name']\n",
        "\n",
        "# Remove the rows whose horsepower is the placeholder '?' and convert to float.\n",
        "# The replace() result is assigned back: calling replace(inplace=True) on the\n",
        "# selected column works on a temporary object and does not update df under\n",
        "# pandas Copy-on-Write, which would make the astype('float') below fail.\n",
        "df['horsepower'] = df['horsepower'].replace('?', np.nan)   # '?' -> NaN\n",
        "df.dropna(subset=['horsepower'], axis=0, inplace=True)     # drop the NaN rows\n",
        "df['horsepower'] = df['horsepower'].astype('float')        # string -> float\n",
        "\n",
        "# np.histogram returns the edges that split the value range into 3 equal-width bins\n",
        "count, bin_dividers = np.histogram(df['horsepower'], bins=3)\n",
        "\n",
        "# Names for the 3 bins, lowest range first\n",
        "bin_names = ['저출력', '보통출력', '고출력']\n",
        "\n",
        "# Assign every horsepower value to one of the 3 bins with pd.cut\n",
        "df['hp_bin'] = pd.cut(x=df['horsepower'],     # data to bin\n",
        "                      bins=bin_dividers,      # bin edges\n",
        "                      labels=bin_names,       # bin names\n",
        "                      include_lowest=True)    # include the first edge\n",
        "\n",
        "# Convert the categorical hp_bin column to dummy variables.\n",
        "# dtype=int keeps the 0/1 display; pandas 2.0+ returns booleans by default.\n",
        "horsepower_dummies = pd.get_dummies(df['hp_bin'], dtype=int)\n",
        "print(horsepower_dummies.head(15))\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "    저출력  보통출력  고출력\n",
            "0     0     1    0\n",
            "1     0     1    0\n",
            "2     0     1    0\n",
            "3     0     1    0\n",
            "4     0     1    0\n",
            "5     0     0    1\n",
            "6     0     0    1\n",
            "7     0     0    1\n",
            "8     0     0    1\n",
            "9     0     0    1\n",
            "10    0     0    1\n",
            "11    0     1    0\n",
            "12    0     1    0\n",
            "13    0     0    1\n",
            "14    1     0    0\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "k6rqjHc_TQ6i"
      },
      "source": [
        "## 원핫인코딩 ##"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5JG-77E8TVVh",
        "outputId": "42e823db-fdd4-410d-ec82-54b44311bc00"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# Import the libraries\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "from sklearn import preprocessing\n",
        "\n",
        "# Build df with read_csv()\n",
        "df = pd.read_csv('./auto-mpg.csv', header=None)\n",
        "\n",
        "# Assign the column names\n",
        "df.columns = ['mpg','cylinders','displacement','horsepower','weight',\n",
        "              'acceleration','model year','origin','name']\n",
        "\n",
        "# Remove the rows whose horsepower is the placeholder '?' and convert to float.\n",
        "# The replace() result is assigned back: calling replace(inplace=True) on the\n",
        "# selected column works on a temporary object and does not update df under\n",
        "# pandas Copy-on-Write, which would make the astype('float') below fail.\n",
        "df['horsepower'] = df['horsepower'].replace('?', np.nan)   # '?' -> NaN\n",
        "df.dropna(subset=['horsepower'], axis=0, inplace=True)     # drop the NaN rows\n",
        "df['horsepower'] = df['horsepower'].astype('float')        # string -> float\n",
        "\n",
        "# np.histogram returns the edges that split the value range into 3 equal-width bins\n",
        "count, bin_dividers = np.histogram(df['horsepower'], bins=3)\n",
        "\n",
        "# Names for the 3 bins, lowest range first\n",
        "bin_names = ['저출력', '보통출력', '고출력']\n",
        "\n",
        "# Assign every horsepower value to one of the 3 bins with pd.cut\n",
        "df['hp_bin'] = pd.cut(x=df['horsepower'],     # data to bin\n",
        "                      bins=bin_dividers,      # bin edges\n",
        "                      labels=bin_names,       # bin names\n",
        "                      include_lowest=True)    # include the first edge\n",
        "\n",
        "# Create the sklearn encoder objects used for preprocessing\n",
        "label_encoder = preprocessing.LabelEncoder()     # label encoder\n",
        "onehot_encoder = preprocessing.OneHotEncoder()   # one-hot encoder\n",
        "\n",
        "# Encode the string categories of the first 15 rows as integer labels\n",
        "onehot_labeled = label_encoder.fit_transform(df['hp_bin'].head(15))\n",
        "print(onehot_labeled)\n",
        "print(type(onehot_labeled))\n",
        "\n",
        "# Reshape the 1-D label array into a single-column 2-D array\n",
        "onehot_reshaped = onehot_labeled.reshape(len(onehot_labeled), 1)\n",
        "print(onehot_reshaped)\n",
        "print(type(onehot_reshaped))\n",
        "\n",
        "# One-hot encode the labels; the result is a sparse matrix\n",
        "onehot_fitted = onehot_encoder.fit_transform(onehot_reshaped)\n",
        "print(onehot_fitted)\n",
        "print(type(onehot_fitted))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[1 1 1 1 1 0 0 0 0 0 0 1 1 0 2]\n",
            "<class 'numpy.ndarray'>\n",
            "[[1]\n",
            " [1]\n",
            " [1]\n",
            " [1]\n",
            " [1]\n",
            " [0]\n",
            " [0]\n",
            " [0]\n",
            " [0]\n",
            " [0]\n",
            " [0]\n",
            " [1]\n",
            " [1]\n",
            " [0]\n",
            " [2]]\n",
            "<class 'numpy.ndarray'>\n",
            "  (0, 1)\t1.0\n",
            "  (1, 1)\t1.0\n",
            "  (2, 1)\t1.0\n",
            "  (3, 1)\t1.0\n",
            "  (4, 1)\t1.0\n",
            "  (5, 0)\t1.0\n",
            "  (6, 0)\t1.0\n",
            "  (7, 0)\t1.0\n",
            "  (8, 0)\t1.0\n",
            "  (9, 0)\t1.0\n",
            "  (10, 0)\t1.0\n",
            "  (11, 1)\t1.0\n",
            "  (12, 1)\t1.0\n",
            "  (13, 0)\t1.0\n",
            "  (14, 2)\t1.0\n",
            "<class 'scipy.sparse.csr.csr_matrix'>\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "j_w-PgT9V3XA"
      },
      "source": [
        "# 정규화 #\n",
        "- df.horsepower = df.horsepower / abs(df.horsepower.max()) "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "jpjRex_kWBbm",
        "outputId": "7bf40d14-f71a-42ab-97b1-cb9b79f67a71"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# 라이브러리 불러오기\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "# read_csv() 함수로 df 생성\n",
        "df = pd.read_csv('./auto-mpg.csv', header=None)\n",
        "\n",
        "# 열 이름을 지정\n",
        "df.columns = ['mpg','cylinders','displacement','horsepower','weight',\n",
        "              'acceleration','model year','origin','name']  \n",
        "\n",
        "# horsepower 열의 누락 데이터('?') 삭제하고 실수형으로 변환\n",
        "df['horsepower'].replace('?', np.nan, inplace=True)      # '?'을 np.nan으로 변경\n",
        "df.dropna(subset=['horsepower'], axis=0, inplace=True)   # 누락데이터 행을 삭제\n",
        "df['horsepower'] = df['horsepower'].astype('float')      # 문자열을 실수형으로 변환\n",
        "\n",
        "# horsepower 열의 통계 요약정보로 최대값(max)을 확인\n",
        "print(df.horsepower.describe())\n",
        "print('\\n')\n",
        "\n",
        "# horsepower 열의 최대값의 절대값으로 모든 데이터를 나눠서 저장\n",
        "df.horsepower = df.horsepower / abs(df.horsepower.max()) \n",
        "\n",
        "print(df.horsepower.head())\n",
        "print('\\n')\n",
        "print(df.horsepower.describe())"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "count    392.000000\n",
            "mean     104.469388\n",
            "std       38.491160\n",
            "min       46.000000\n",
            "25%       75.000000\n",
            "50%       93.500000\n",
            "75%      126.000000\n",
            "max      230.000000\n",
            "Name: horsepower, dtype: float64\n",
            "\n",
            "\n",
            "0    0.565217\n",
            "1    0.717391\n",
            "2    0.652174\n",
            "3    0.652174\n",
            "4    0.608696\n",
            "Name: horsepower, dtype: float64\n",
            "\n",
            "\n",
            "count    392.000000\n",
            "mean       0.454215\n",
            "std        0.167353\n",
            "min        0.200000\n",
            "25%        0.326087\n",
            "50%        0.406522\n",
            "75%        0.547826\n",
            "max        1.000000\n",
            "Name: horsepower, dtype: float64\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "L6EtlmcQWaQm",
        "outputId": "f21cccf9-bc66-4919-c3c3-8e4cb6d1ee1a"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# 라이브러리 불러오기\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "# read_csv() 함수로 df 생성\n",
        "df = pd.read_csv('./auto-mpg.csv', header=None)\n",
        "\n",
        "# 열 이름을 지정\n",
        "df.columns = ['mpg','cylinders','displacement','horsepower','weight',\n",
        "              'acceleration','model year','origin','name']  \n",
        "\n",
        "# horsepower 열의 누락 데이터('?') 삭제하고 실수형으로 변환\n",
        "df['horsepower'].replace('?', np.nan, inplace=True)      # '?'을 np.nan으로 변경\n",
        "df.dropna(subset=['horsepower'], axis=0, inplace=True)   # 누락데이터 행을 삭제\n",
        "df['horsepower'] = df['horsepower'].astype('float')      # 문자열을 실수형으로 변환\n",
        "\n",
        "# horsepower 열의 통계 요약정보로 최대값(max)과 최소값(min)을 확인\n",
        "print(df.horsepower.describe())\n",
        "print('\\n')\n",
        "\n",
        "# horsepower 열의 최대값의 절대값으로 모든 데이터를 나눠서 저장\n",
        "min_x = df.horsepower - df.horsepower.min()\n",
        "min_max = df.horsepower.max() - df.horsepower.min()\n",
        "df.horsepower = min_x / min_max\n",
        "\n",
        "print(df.horsepower.head())\n",
        "print('\\n')\n",
        "print(df.horsepower.describe())"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "count    392.000000\n",
            "mean     104.469388\n",
            "std       38.491160\n",
            "min       46.000000\n",
            "25%       75.000000\n",
            "50%       93.500000\n",
            "75%      126.000000\n",
            "max      230.000000\n",
            "Name: horsepower, dtype: float64\n",
            "\n",
            "\n",
            "0    0.456522\n",
            "1    0.646739\n",
            "2    0.565217\n",
            "3    0.565217\n",
            "4    0.510870\n",
            "Name: horsepower, dtype: float64\n",
            "\n",
            "\n",
            "count    392.000000\n",
            "mean       0.317768\n",
            "std        0.209191\n",
            "min        0.000000\n",
            "25%        0.157609\n",
            "50%        0.258152\n",
            "75%        0.434783\n",
            "max        1.000000\n",
            "Name: horsepower, dtype: float64\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wLjJBz7RW6Px"
      },
      "source": [
        "# 시계열 데이터 #"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iYOmW4XmW9Xw"
      },
      "source": [
        "## 다른 자료형을 시계열 객체로 변환 ##"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "RMMYUehEXMmW"
      },
      "source": [
        "### 문자열을 Timestamp로 변환 ###\n",
        "- df.to_datetime() : Timestamp를 나타내는 datetime64로 변환"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xnlAd6gVW8YF",
        "outputId": "c7b4dc3f-265b-4851-b2e0-73b32d02ff40"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# 라이브러리 불러오기\n",
        "import pandas as pd\n",
        "\n",
        "# read_csv() 함수로 CSV 파일을 가져와서 df로 변환\n",
        "df = pd.read_csv('stock-data.csv')\n",
        "\n",
        "# 데이터 내용 및 자료형 자료형 확인\n",
        "print(df.head())\n",
        "print('\\n')\n",
        "print(df.info())\n",
        "\n",
        "# 문자열 데이터(시리즈 객체)를 판다스 Timestamp로 변환\n",
        "df['new_Date'] = pd.to_datetime(df['Date'])   #df에 새로운 열로 추가\n",
        "\n",
        "# 데이터 내용 및 자료형 자료형 확인\n",
        "print(df.head())\n",
        "print('\\n')\n",
        "print(df.info())\n",
        "print('\\n')\n",
        "print(type(df['new_Date'][0]))\n",
        "\n",
        "# 시계열 값으로 변환된 열을 새로운 행 인덱스로 지정. 기존 날짜 열은 삭제\n",
        "df.set_index('new_Date', inplace=True)\n",
        "df.drop('Date', axis=1, inplace=True)\n",
        "\n",
        "# 데이터 내용 및 자료형 자료형 확인\n",
        "print(df.head())\n",
        "print('\\n')\n",
        "print(df.info())\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "         Date  Close  Start   High    Low  Volume\n",
            "0  2018-07-02  10100  10850  10900  10000  137977\n",
            "1  2018-06-29  10700  10550  10900   9990  170253\n",
            "2  2018-06-28  10400  10900  10950  10150  155769\n",
            "3  2018-06-27  10900  10800  11050  10500  133548\n",
            "4  2018-06-26  10800  10900  11000  10700   63039\n",
            "\n",
            "\n",
            "<class 'pandas.core.frame.DataFrame'>\n",
            "RangeIndex: 20 entries, 0 to 19\n",
            "Data columns (total 6 columns):\n",
            " #   Column  Non-Null Count  Dtype \n",
            "---  ------  --------------  ----- \n",
            " 0   Date    20 non-null     object\n",
            " 1   Close   20 non-null     int64 \n",
            " 2   Start   20 non-null     int64 \n",
            " 3   High    20 non-null     int64 \n",
            " 4   Low     20 non-null     int64 \n",
            " 5   Volume  20 non-null     int64 \n",
            "dtypes: int64(5), object(1)\n",
            "memory usage: 1.1+ KB\n",
            "None\n",
            "         Date  Close  Start   High    Low  Volume   new_Date\n",
            "0  2018-07-02  10100  10850  10900  10000  137977 2018-07-02\n",
            "1  2018-06-29  10700  10550  10900   9990  170253 2018-06-29\n",
            "2  2018-06-28  10400  10900  10950  10150  155769 2018-06-28\n",
            "3  2018-06-27  10900  10800  11050  10500  133548 2018-06-27\n",
            "4  2018-06-26  10800  10900  11000  10700   63039 2018-06-26\n",
            "\n",
            "\n",
            "<class 'pandas.core.frame.DataFrame'>\n",
            "RangeIndex: 20 entries, 0 to 19\n",
            "Data columns (total 7 columns):\n",
            " #   Column    Non-Null Count  Dtype         \n",
            "---  ------    --------------  -----         \n",
            " 0   Date      20 non-null     object        \n",
            " 1   Close     20 non-null     int64         \n",
            " 2   Start     20 non-null     int64         \n",
            " 3   High      20 non-null     int64         \n",
            " 4   Low       20 non-null     int64         \n",
            " 5   Volume    20 non-null     int64         \n",
            " 6   new_Date  20 non-null     datetime64[ns]\n",
            "dtypes: datetime64[ns](1), int64(5), object(1)\n",
            "memory usage: 1.2+ KB\n",
            "None\n",
            "\n",
            "\n",
            "<class 'pandas._libs.tslibs.timestamps.Timestamp'>\n",
            "            Close  Start   High    Low  Volume\n",
            "new_Date                                      \n",
            "2018-07-02  10100  10850  10900  10000  137977\n",
            "2018-06-29  10700  10550  10900   9990  170253\n",
            "2018-06-28  10400  10900  10950  10150  155769\n",
            "2018-06-27  10900  10800  11050  10500  133548\n",
            "2018-06-26  10800  10900  11000  10700   63039\n",
            "\n",
            "\n",
            "<class 'pandas.core.frame.DataFrame'>\n",
            "DatetimeIndex: 20 entries, 2018-07-02 to 2018-06-01\n",
            "Data columns (total 5 columns):\n",
            " #   Column  Non-Null Count  Dtype\n",
            "---  ------  --------------  -----\n",
            " 0   Close   20 non-null     int64\n",
            " 1   Start   20 non-null     int64\n",
            " 2   High    20 non-null     int64\n",
            " 3   Low     20 non-null     int64\n",
            " 4   Volume  20 non-null     int64\n",
            "dtypes: int64(5)\n",
            "memory usage: 960.0 bytes\n",
            "None\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Ujcyw-bvYBOe"
      },
      "source": [
        "### Timestamp를 Period로 변환"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "jeu6Hh90YEv6",
        "outputId": "faba5c7f-d635-4983-d048-2efc430db368"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# 라이브러리 불러오기\n",
        "import pandas as pd\n",
        "\n",
        "# 날짜 형식의 문자열로 구성되는 리스트 정의\n",
        "dates = ['2019-01-01', '2020-03-01', '2021-06-01']\n",
        "\n",
        "# 문자열 데이터(시리즈 객체)를 판다스 Timestamp로 변환\n",
        "ts_dates = pd.to_datetime(dates)   \n",
        "print(ts_dates)\n",
        "print('\\n')\n",
        "\n",
        "# Timestamp를 Period로 변환\n",
        "pr_day = ts_dates.to_period(freq='D')\n",
        "print(pr_day)\n",
        "pr_month = ts_dates.to_period(freq='M')\n",
        "print(pr_month)\n",
        "pr_year = ts_dates.to_period(freq='A')\n",
        "print(pr_year)\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "DatetimeIndex(['2019-01-01', '2020-03-01', '2021-06-01'], dtype='datetime64[ns]', freq=None)\n",
            "\n",
            "\n",
            "PeriodIndex(['2019-01-01', '2020-03-01', '2021-06-01'], dtype='period[D]', freq='D')\n",
            "PeriodIndex(['2019-01', '2020-03', '2021-06'], dtype='period[M]', freq='M')\n",
            "PeriodIndex(['2019', '2020', '2021'], dtype='period[A-DEC]', freq='A-DEC')\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Rd4co2uYYqSP"
      },
      "source": [
        "## 시계열 데이터 만들기 ##"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gSoyKrJ0ZbXG"
      },
      "source": [
        "### Timestamp 배열 ###"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "uqTjFagKYteU",
        "outputId": "e7b0832e-1651-4a3f-80b8-a1514ad62c8d"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# 라이브러리 불러오기\n",
        "import pandas as pd\n",
        "\n",
        "# Timestamp의 배열 만들기 - 월 간격, 월의 시작일 기준\n",
        "ts_ms = pd.date_range(start='2019-01-01',    # 날짜 범위의 시작\n",
        "                   end=None,                 # 날짜 범위의 끝\n",
        "                   periods=6,                # 생성할 Timestamp의 개수\n",
        "                   freq='MS',                # 시간 간격 (MS: 월의 시작일)\n",
        "                   tz='Asia/Seoul')          # 시간대(timezone)\n",
        "print(ts_ms)\n",
        "print('\\n')\n",
        "\n",
        "# 월 간격, 월의 마지막 날 기준\n",
        "ts_me = pd.date_range('2019-01-01', periods=6, \n",
        "                   freq='M',              # 시간 간격 (M: 월의 마지막 날)\n",
        "                   tz='Asia/Seoul')       # 시간대(timezone)\n",
        "print(ts_me)\n",
        "print('\\n')\n",
        "\n",
        "# 분기(3개월) 간격, 월의 마지막 날 기준\n",
        "ts_3m = pd.date_range('2019-01-01', periods=6, \n",
        "                   freq='3M',             # 시간 간격 (3M: 3개월)\n",
        "                   tz='Asia/Seoul')       # 시간대(timezone)\n",
        "print(ts_3m)\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "DatetimeIndex(['2019-01-01 00:00:00+09:00', '2019-02-01 00:00:00+09:00',\n",
            "               '2019-03-01 00:00:00+09:00', '2019-04-01 00:00:00+09:00',\n",
            "               '2019-05-01 00:00:00+09:00', '2019-06-01 00:00:00+09:00'],\n",
            "              dtype='datetime64[ns, Asia/Seoul]', freq='MS')\n",
            "\n",
            "\n",
            "DatetimeIndex(['2019-01-31 00:00:00+09:00', '2019-02-28 00:00:00+09:00',\n",
            "               '2019-03-31 00:00:00+09:00', '2019-04-30 00:00:00+09:00',\n",
            "               '2019-05-31 00:00:00+09:00', '2019-06-30 00:00:00+09:00'],\n",
            "              dtype='datetime64[ns, Asia/Seoul]', freq='M')\n",
            "\n",
            "\n",
            "DatetimeIndex(['2019-01-31 00:00:00+09:00', '2019-04-30 00:00:00+09:00',\n",
            "               '2019-07-31 00:00:00+09:00', '2019-10-31 00:00:00+09:00',\n",
            "               '2020-01-31 00:00:00+09:00', '2020-04-30 00:00:00+09:00'],\n",
            "              dtype='datetime64[ns, Asia/Seoul]', freq='3M')\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sVltDvyJZfGG"
      },
      "source": [
        "### Period 배열 ###"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "BtUwca3ZZkAu",
        "outputId": "94485c55-bfb0-4adb-cbad-c4967fe1f8ee"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# 라이브러리 불러오기\n",
        "import pandas as pd\n",
        "\n",
        "# Period 배열 만들기 - 1개월 길이\n",
        "pr_m = pd.period_range(start='2019-01-01',     # 날짜 범위의 시작\n",
        "                   end=None,                   # 날짜 범위의 끝\n",
        "                   periods=3,                  # 생성할 Period 개수\n",
        "                   freq='M')                   # 기간의 길이 (M: 월)\n",
        "print(pr_m)\n",
        "print('\\n')\n",
        "\n",
        "# Period 배열 만들기 - 1시간 길이\n",
        "pr_h = pd.period_range(start='2019-01-01',     # 날짜 범위의 시작\n",
        "                   end=None,                   # 날짜 범위의 끝\n",
        "                   periods=3,                  # 생성할 Period 개수\n",
        "                   freq='H')                   # 기간의 길이 (H: 시간)\n",
        "print(pr_h)\n",
        "print('\\n')\n",
        "\n",
        "# Period 배열 만들기 - 2시간 길이\n",
        "pr_2h = pd.period_range(start='2019-01-01',    # 날짜 범위의 시작\n",
        "                   end=None,                   # 날짜 범위의 끝\n",
        "                   periods=3,                  # 생성할 Period 개수\n",
        "                   freq='2H')                  # 기간의 길이 (H: 시간)\n",
        "print(pr_2h)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "PeriodIndex(['2019-01', '2019-02', '2019-03'], dtype='period[M]', freq='M')\n",
            "\n",
            "\n",
            "PeriodIndex(['2019-01-01 00:00', '2019-01-01 01:00', '2019-01-01 02:00'], dtype='period[H]', freq='H')\n",
            "\n",
            "\n",
            "PeriodIndex(['2019-01-01 00:00', '2019-01-01 02:00', '2019-01-01 04:00'], dtype='period[2H]', freq='2H')\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sNyWHNb2aS0W"
      },
      "source": [
        "## 시계열 데이터 활용 ##"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5j4Efbt7aZFt"
      },
      "source": [
        "### 날짜 데이터 분리 ###"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YjRgyEq_aYC2",
        "outputId": "9778dcae-66e5-4d99-a0ed-05e8fa72835a"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# 라이브러리 불러오기\n",
        "import pandas as pd\n",
        "\n",
        "# read_csv() 함수로 파일 읽어와서 df로 변환\n",
        "df = pd.read_csv('stock-data.csv')\n",
        "\n",
        "# 문자열인 날짜 데이터를 판다스 Timestamp로 변환\n",
        "df['new_Date'] = pd.to_datetime(df['Date'])   #df에 새로운 열로 추가\n",
        "print(df.head())\n",
        "print('\\n')\n",
        "\n",
        "# dt 속성을 이용하여 new_Date 열의 년월일 정보를 년, 월, 일로 구분\n",
        "df['Year'] = df['new_Date'].dt.year\n",
        "df['Month'] = df['new_Date'].dt.month\n",
        "df['Day'] = df['new_Date'].dt.day\n",
        "print(df.head())\n",
        "print('\\n')\n",
        "\n",
        "# Timestamp를 Period로 변환하여 년월일 표기 변경하기\n",
        "df['Date_yr'] = df['new_Date'].dt.to_period(freq='A')\n",
        "df['Date_m'] = df['new_Date'].dt.to_period(freq='M')\n",
        "print(df.head())\n",
        "print('\\n')\n",
        "\n",
        "# 원하는 열을 새로운 행 인덱스로 지정\n",
        "df.set_index('Date_m', inplace=True)\n",
        "print(df.head())\n"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "         Date  Close  Start   High    Low  Volume   new_Date\n",
            "0  2018-07-02  10100  10850  10900  10000  137977 2018-07-02\n",
            "1  2018-06-29  10700  10550  10900   9990  170253 2018-06-29\n",
            "2  2018-06-28  10400  10900  10950  10150  155769 2018-06-28\n",
            "3  2018-06-27  10900  10800  11050  10500  133548 2018-06-27\n",
            "4  2018-06-26  10800  10900  11000  10700   63039 2018-06-26\n",
            "\n",
            "\n",
            "         Date  Close  Start   High    Low  Volume   new_Date  Year  Month  Day\n",
            "0  2018-07-02  10100  10850  10900  10000  137977 2018-07-02  2018      7    2\n",
            "1  2018-06-29  10700  10550  10900   9990  170253 2018-06-29  2018      6   29\n",
            "2  2018-06-28  10400  10900  10950  10150  155769 2018-06-28  2018      6   28\n",
            "3  2018-06-27  10900  10800  11050  10500  133548 2018-06-27  2018      6   27\n",
            "4  2018-06-26  10800  10900  11000  10700   63039 2018-06-26  2018      6   26\n",
            "\n",
            "\n",
            "         Date  Close  Start   High    Low  ...  Year Month  Day  Date_yr   Date_m\n",
            "0  2018-07-02  10100  10850  10900  10000  ...  2018     7    2     2018  2018-07\n",
            "1  2018-06-29  10700  10550  10900   9990  ...  2018     6   29     2018  2018-06\n",
            "2  2018-06-28  10400  10900  10950  10150  ...  2018     6   28     2018  2018-06\n",
            "3  2018-06-27  10900  10800  11050  10500  ...  2018     6   27     2018  2018-06\n",
            "4  2018-06-26  10800  10900  11000  10700  ...  2018     6   26     2018  2018-06\n",
            "\n",
            "[5 rows x 12 columns]\n",
            "\n",
            "\n",
            "               Date  Close  Start   High  ...  Year  Month Day  Date_yr\n",
            "Date_m                                    ...                          \n",
            "2018-07  2018-07-02  10100  10850  10900  ...  2018      7   2     2018\n",
            "2018-06  2018-06-29  10700  10550  10900  ...  2018      6  29     2018\n",
            "2018-06  2018-06-28  10400  10900  10950  ...  2018      6  28     2018\n",
            "2018-06  2018-06-27  10900  10800  11050  ...  2018      6  27     2018\n",
            "2018-06  2018-06-26  10800  10900  11000  ...  2018      6  26     2018\n",
            "\n",
            "[5 rows x 11 columns]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TuRmezh6au8l"
      },
      "source": [
        "### 날짜 인덱스 활용 ###"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CpZsKsZHaw8-",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "e24c9f36-a8a2-4e4b-b0bb-0075ef115c61"
      },
      "source": [
        "# -*- coding: utf-8 -*-\n",
        "\n",
        "# 라이브러리 불러오기\n",
        "import pandas as pd\n",
        "\n",
        "# read_csv() 함수로 파일 읽어와서 df로 변환\n",
        "df = pd.read_csv('stock-data.csv')\n",
        "\n",
        "# 문자열인 날짜 데이터를 판다스 Timestamp로 변환\n",
        "df['new_Date'] = pd.to_datetime(df['Date'])   # 새로운 열에 추가\n",
        "df.set_index('new_Date', inplace=True)        # 행 인덱스로 지정\n",
        "\n",
        "print(df.head())\n",
        "print('\\n')\n",
        "print(df.index)\n",
        "print('\\n')\n",
        "\n",
        "# 날짜 인덱스를 이용하여 데이터 선택하기\n",
        "df_y = df['2018']\n",
        "print(df_y.head())\n",
        "print('\\n')\n",
        "df_ym = df.loc['2018-07']    # loc 인덱서 활용\n",
        "print(df_ym)\n",
        "print('\\n')\n",
        "df_ym_cols = df.loc['2018-07', 'Start':'High']    # 열 범위 슬라이싱\n",
        "print(df_ym_cols)\n",
        "print('\\n')\n",
        "df_ymd = df['2018-07-02']\n",
        "print(df_ymd)\n",
        "print('\\n')\n",
        "df_ymd_range = df['2018-06-25':'2018-06-20']    # 날짜 범위 지정\n",
        "print(df_ymd_range)\n",
        "print('\\n')\n",
        "\n",
        "# 시간 간격 계산. 최근 180일 ~ 189일 사이의 값들만 선택하기\n",
        "today = pd.to_datetime('2018-12-25')            # 기준일 생성\n",
        "df['time_delta'] = today - df.index             # 날짜 차이 계산\n",
        "df.set_index('time_delta', inplace=True)        # 행 인덱스로 지정\n",
        "df_180 = df['180 days':'189 days']\n",
        "print(df_180)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "                  Date  Close  Start   High    Low  Volume\n",
            "new_Date                                                  \n",
            "2018-07-02  2018-07-02  10100  10850  10900  10000  137977\n",
            "2018-06-29  2018-06-29  10700  10550  10900   9990  170253\n",
            "2018-06-28  2018-06-28  10400  10900  10950  10150  155769\n",
            "2018-06-27  2018-06-27  10900  10800  11050  10500  133548\n",
            "2018-06-26  2018-06-26  10800  10900  11000  10700   63039\n",
            "\n",
            "\n",
            "DatetimeIndex(['2018-07-02', '2018-06-29', '2018-06-28', '2018-06-27',\n",
            "               '2018-06-26', '2018-06-25', '2018-06-22', '2018-06-21',\n",
            "               '2018-06-20', '2018-06-19', '2018-06-18', '2018-06-15',\n",
            "               '2018-06-14', '2018-06-12', '2018-06-11', '2018-06-08',\n",
            "               '2018-06-07', '2018-06-05', '2018-06-04', '2018-06-01'],\n",
            "              dtype='datetime64[ns]', name='new_Date', freq=None)\n",
            "\n",
            "\n",
            "                  Date  Close  Start   High    Low  Volume\n",
            "new_Date                                                  \n",
            "2018-07-02  2018-07-02  10100  10850  10900  10000  137977\n",
            "2018-06-29  2018-06-29  10700  10550  10900   9990  170253\n",
            "2018-06-28  2018-06-28  10400  10900  10950  10150  155769\n",
            "2018-06-27  2018-06-27  10900  10800  11050  10500  133548\n",
            "2018-06-26  2018-06-26  10800  10900  11000  10700   63039\n",
            "\n",
            "\n",
            "                  Date  Close  Start   High    Low  Volume\n",
            "new_Date                                                  \n",
            "2018-07-02  2018-07-02  10100  10850  10900  10000  137977\n",
            "\n",
            "\n",
            "            Start   High\n",
            "new_Date                \n",
            "2018-07-02  10850  10900\n",
            "\n",
            "\n",
            "                  Date  Close  Start   High    Low  Volume\n",
            "new_Date                                                  \n",
            "2018-07-02  2018-07-02  10100  10850  10900  10000  137977\n",
            "\n",
            "\n",
            "                  Date  Close  Start   High    Low  Volume\n",
            "new_Date                                                  \n",
            "2018-06-25  2018-06-25  11150  11400  11450  11000   55519\n",
            "2018-06-22  2018-06-22  11300  11250  11450  10750  134805\n",
            "2018-06-21  2018-06-21  11200  11350  11750  11200  133002\n",
            "2018-06-20  2018-06-20  11550  11200  11600  10900  308596\n",
            "\n",
            "\n",
            "                  Date  Close  Start   High    Low  Volume\n",
            "time_delta                                                \n",
            "180 days    2018-06-28  10400  10900  10950  10150  155769\n",
            "181 days    2018-06-27  10900  10800  11050  10500  133548\n",
            "182 days    2018-06-26  10800  10900  11000  10700   63039\n",
            "183 days    2018-06-25  11150  11400  11450  11000   55519\n",
            "186 days    2018-06-22  11300  11250  11450  10750  134805\n",
            "187 days    2018-06-21  11200  11350  11750  11200  133002\n",
            "188 days    2018-06-20  11550  11200  11600  10900  308596\n",
            "189 days    2018-06-19  11300  11850  11950  11300  180656\n"
          ]
        }
      ]
    }
  ]
}