zhusimaji/pandas.ipynb

## pandas.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/leiyang/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (12,13,14,15,19,20,81,83,85,87,93,94,95,96,97,98,99,100,105,106,108,109,111,112,114,115,117,118,120,121,123,124,126,127,129,130,132,133,135,136,138,139,141,142,144,145,147,148,150,151,153,154,156,157,160) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  interactivity=interactivity, compiler=compiler, result=result)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>number_of_game</th>\n",
       "      <th>day_of_week</th>\n",
       "      <th>v_name</th>\n",
       "      <th>v_league</th>\n",
       "      <th>v_game_number</th>\n",
       "      <th>h_name</th>\n",
       "      <th>h_league</th>\n",
       "      <th>h_game_number</th>\n",
       "      <th>v_score</th>\n",
       "      <th>...</th>\n",
       "      <th>h_player_7_name</th>\n",
       "      <th>h_player_7_def_pos</th>\n",
       "      <th>h_player_8_id</th>\n",
       "      <th>h_player_8_name</th>\n",
       "      <th>h_player_8_def_pos</th>\n",
       "      <th>h_player_9_id</th>\n",
       "      <th>h_player_9_name</th>\n",
       "      <th>h_player_9_def_pos</th>\n",
       "      <th>additional_info</th>\n",
       "      <th>acquisition_info</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>18710504</td>\n",
       "      <td>0</td>\n",
       "      <td>Thu</td>\n",
       "      <td>CL1</td>\n",
       "      <td>na</td>\n",
       "      <td>1</td>\n",
       "      <td>FW1</td>\n",
       "      <td>na</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>Ed Mincher</td>\n",
       "      <td>7.0</td>\n",
       "      <td>mcdej101</td>\n",
       "      <td>James McDermott</td>\n",
       "      <td>8.0</td>\n",
       "      <td>kellb105</td>\n",
       "      <td>Bill Kelly</td>\n",
       "      <td>9.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>18710505</td>\n",
       "      <td>0</td>\n",
       "      <td>Fri</td>\n",
       "      <td>BS1</td>\n",
       "      <td>na</td>\n",
       "      <td>1</td>\n",
       "      <td>WS3</td>\n",
       "      <td>na</td>\n",
       "      <td>1</td>\n",
       "      <td>20</td>\n",
       "      <td>...</td>\n",
       "      <td>Asa Brainard</td>\n",
       "      <td>1.0</td>\n",
       "      <td>burrh101</td>\n",
       "      <td>Henry Burroughs</td>\n",
       "      <td>9.0</td>\n",
       "      <td>berth101</td>\n",
       "      <td>Henry Berthrong</td>\n",
       "      <td>8.0</td>\n",
       "      <td>HTBF</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>18710506</td>\n",
       "      <td>0</td>\n",
       "      <td>Sat</td>\n",
       "      <td>CL1</td>\n",
       "      <td>na</td>\n",
       "      <td>2</td>\n",
       "      <td>RC1</td>\n",
       "      <td>na</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>...</td>\n",
       "      <td>Pony Sager</td>\n",
       "      <td>6.0</td>\n",
       "      <td>birdg101</td>\n",
       "      <td>George Bird</td>\n",
       "      <td>7.0</td>\n",
       "      <td>stirg101</td>\n",
       "      <td>Gat Stires</td>\n",
       "      <td>9.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>18710508</td>\n",
       "      <td>0</td>\n",
       "      <td>Mon</td>\n",
       "      <td>CL1</td>\n",
       "      <td>na</td>\n",
       "      <td>3</td>\n",
       "      <td>CH1</td>\n",
       "      <td>na</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>...</td>\n",
       "      <td>Ed Duffy</td>\n",
       "      <td>6.0</td>\n",
       "      <td>pinke101</td>\n",
       "      <td>Ed Pinkham</td>\n",
       "      <td>5.0</td>\n",
       "      <td>zettg101</td>\n",
       "      <td>George Zettlein</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>18710509</td>\n",
       "      <td>0</td>\n",
       "      <td>Tue</td>\n",
       "      <td>BS1</td>\n",
       "      <td>na</td>\n",
       "      <td>2</td>\n",
       "      <td>TRO</td>\n",
       "      <td>na</td>\n",
       "      <td>1</td>\n",
       "      <td>9</td>\n",
       "      <td>...</td>\n",
       "      <td>Steve Bellan</td>\n",
       "      <td>5.0</td>\n",
       "      <td>pikel101</td>\n",
       "      <td>Lip Pike</td>\n",
       "      <td>3.0</td>\n",
       "      <td>cravb101</td>\n",
       "      <td>Bill Craver</td>\n",
       "      <td>6.0</td>\n",
       "      <td>HTBF</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 161 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       date  number_of_game day_of_week v_name v_league  v_game_number h_name  \\\n",
       "0  18710504               0         Thu    CL1       na              1    FW1   \n",
       "1  18710505               0         Fri    BS1       na              1    WS3   \n",
       "2  18710506               0         Sat    CL1       na              2    RC1   \n",
       "3  18710508               0         Mon    CL1       na              3    CH1   \n",
       "4  18710509               0         Tue    BS1       na              2    TRO   \n",
       "\n",
       "  h_league  h_game_number  v_score       ...         h_player_7_name  \\\n",
       "0       na              1        0       ...              Ed Mincher   \n",
       "1       na              1       20       ...            Asa Brainard   \n",
       "2       na              1       12       ...              Pony Sager   \n",
       "3       na              1       12       ...                Ed Duffy   \n",
       "4       na              1        9       ...            Steve Bellan   \n",
       "\n",
       "   h_player_7_def_pos h_player_8_id  h_player_8_name h_player_8_def_pos  \\\n",
       "0                 7.0      mcdej101  James McDermott                8.0   \n",
       "1                 1.0      burrh101  Henry Burroughs                9.0   \n",
       "2                 6.0      birdg101      George Bird                7.0   \n",
       "3                 6.0      pinke101       Ed Pinkham                5.0   \n",
       "4                 5.0      pikel101         Lip Pike                3.0   \n",
       "\n",
       "  h_player_9_id  h_player_9_name  h_player_9_def_pos  additional_info  \\\n",
       "0      kellb105       Bill Kelly                 9.0              NaN   \n",
       "1      berth101  Henry Berthrong                 8.0             HTBF   \n",
       "2      stirg101       Gat Stires                 9.0              NaN   \n",
       "3      zettg101  George Zettlein                 1.0              NaN   \n",
       "4      cravb101      Bill Craver                 6.0             HTBF   \n",
       "\n",
       "  acquisition_info  \n",
       "0                Y  \n",
       "1                Y  \n",
       "2                Y  \n",
       "3                Y  \n",
       "4                Y  \n",
       "\n",
       "[5 rows x 161 columns]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the game logs. low_memory=False makes pandas infer each column's dtype\n",
    "# from the whole file instead of chunk by chunk, which is the remedy the\n",
    "# DtypeWarning about mixed-type columns asks for.\n",
    "gl = pd.read_csv('game_logs.csv', low_memory=False)\n",
    "gl.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 171907 entries, 0 to 171906\n",
      "Columns: 161 entries, date to acquisition_info\n",
      "dtypes: float64(77), int64(6), object(78)\n",
      "memory usage: 738.1 MB\n"
     ]
    }
   ],
   "source": [
    "# Summarise the frame: row count, dtype counts and total memory footprint.\n",
    "# memory_usage='deep' requests deep introspection of object columns.\n",
    "gl.info(memory_usage='deep')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average memory usage for float columns: 1.29 MB\n",
      "Average memory usage for int columns: 1.12 MB\n",
      "Average memory usage for object columns: 7.96 MB\n"
     ]
    }
   ],
   "source": [
    "# Report the mean memory_usage(deep=True) entry, in MB, for each dtype family.\n",
    "report_line = \"Average memory usage for {} columns: {:03.2f} MB\"\n",
    "\n",
    "for kind in ['float','int','object']:\n",
    "    per_column_bytes = gl.select_dtypes(include=[kind]).memory_usage(deep=True)\n",
    "    print(report_line.format(kind, per_column_bytes.mean() / 1024 ** 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7.00 MB\n",
      "1.00 MB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>before</th>\n",
       "      <th>after</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>uint8</th>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>uint32</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>int64</th>\n",
       "      <td>6.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        before  after\n",
       "uint8      NaN    5.0\n",
       "uint32     NaN    1.0\n",
       "int64      6.0    NaN"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We're going to be calculating memory usage a lot,\n",
    "# so we'll create a function to save us some time!\n",
    "\n",
    "def mem_usage(pandas_obj):\n",
    "    \"\"\"Return the deep memory footprint of a pandas object as a string.\n",
    "\n",
    "    pandas_obj: a DataFrame (per-entry usage is summed) or any other object\n",
    "        exposing memory_usage(deep=True), which is treated as a Series.\n",
    "    Returns the size in megabytes formatted with two decimals, e.g. '1.50 MB'.\n",
    "    \"\"\"\n",
    "    if isinstance(pandas_obj, pd.DataFrame):\n",
    "        usage_b = pandas_obj.memory_usage(deep=True).sum()\n",
    "    else:  # anything that is not a DataFrame is assumed to be a Series\n",
    "        usage_b = pandas_obj.memory_usage(deep=True)\n",
    "    # Divide by a float: this notebook runs on a Python 2 kernel, where\n",
    "    # int / int floors and would truncate every result to whole megabytes.\n",
    "    usage_mb = usage_b / 1024.0 ** 2\n",
    "    return \"{:03.2f} MB\".format(usage_mb)\n",
    "\n",
    "# Downcast every integer column with pd.to_numeric(downcast='unsigned').\n",
    "gl_int = gl.select_dtypes(include=['int'])\n",
    "converted_int = gl_int.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))\n",
    "\n",
    "# Footprint before, then after, the downcast.\n",
    "for int_frame in (gl_int, converted_int):\n",
    "    print(mem_usage(int_frame))\n",
    "\n",
    "# Count how many columns hold each dtype on either side of the conversion.\n",
    "compare_ints = pd.concat([gl_int.dtypes, converted_int.dtypes], axis=1)\n",
    "compare_ints.columns = ['before', 'after']\n",
    "compare_ints.apply(pd.Series.value_counts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 查看 gl_int、converted_int 和 compare_ints"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       date  number_of_game  v_game_number  h_game_number  v_score  h_score\n",
      "0  18710504               0              1              1        0        2\n",
      "1  18710505               0              1              1       20       18\n",
      "2  18710506               0              2              1       12        4\n",
      "3  18710508               0              3              1       12       14\n",
      "4  18710509               0              2              1        9        5\n",
      "5  18710511               0              2              4       18       10\n",
      "6  18710513               0              2              5       12        8\n",
      "7  18710513               0              3              2       14        5\n",
      "8  18710515               0              3              3        6       12\n",
      "9  18710516               0              2              3       29       14\n",
      "       date  number_of_game  v_game_number  h_game_number  v_score  h_score\n",
      "0  18710504               0              1              1        0        2\n",
      "1  18710505               0              1              1       20       18\n",
      "2  18710506               0              2              1       12        4\n",
      "3  18710508               0              3              1       12       14\n",
      "4  18710509               0              2              1        9        5\n",
      "5  18710511               0              2              4       18       10\n",
      "6  18710513               0              2              5       12        8\n",
      "7  18710513               0              3              2       14        5\n",
      "8  18710515               0              3              3        6       12\n",
      "9  18710516               0              2              3       29       14\n",
      "               before   after\n",
      "date            int64  uint32\n",
      "number_of_game  int64   uint8\n",
      "v_game_number   int64   uint8\n",
      "h_game_number   int64   uint8\n",
      "v_score         int64   uint8\n",
      "h_score         int64   uint8\n"
     ]
    }
   ],
   "source": [
    "# print() calls match the rest of the notebook and are valid on both\n",
    "# Python 2 and Python 3 (a single parenthesised argument prints the same).\n",
    "print(gl_int.head(10))\n",
    "\n",
    "print(converted_int.head(10))\n",
    "\n",
    "print(compare_ints)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100.00 MB\n",
      "50.00 MB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>before</th>\n",
       "      <th>after</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>float32</th>\n",
       "      <td>NaN</td>\n",
       "      <td>77.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>float64</th>\n",
       "      <td>77.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         before  after\n",
       "float32     NaN   77.0\n",
       "float64    77.0    NaN"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Downcast every float column with pd.to_numeric(downcast='float').\n",
    "gl_float = gl.select_dtypes(include=['float'])\n",
    "converted_float = gl_float.apply(lambda column: pd.to_numeric(column, downcast='float'))\n",
    "\n",
    "# Footprint before, then after, the downcast.\n",
    "for float_frame in (gl_float, converted_float):\n",
    "    print(mem_usage(float_frame))\n",
    "\n",
    "# Count how many columns hold each dtype on either side of the conversion.\n",
    "compare_floats = pd.concat([gl_float.dtypes, converted_float.dtypes], axis=1)\n",
    "compare_floats.columns = ['before', 'after']\n",
    "compare_floats.apply(pd.Series.value_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "738.00 MB\n",
      "681.00 MB\n"
     ]
    }
   ],
   "source": [
    "# Start from a copy so the original frame stays available for comparison.\n",
    "optimized_gl = gl.copy()\n",
    "\n",
    "# Swap in the downcast integer columns, then the downcast float columns.\n",
    "for downcast_frame in (converted_int, converted_float):\n",
    "    optimized_gl[downcast_frame.columns] = downcast_frame\n",
    "\n",
    "# Footprint of the original frame, then of the optimized one.\n",
    "for frame in (gl, optimized_gl):\n",
    "    print(mem_usage(frame))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>day_of_week</th>\n",
       "      <th>v_name</th>\n",
       "      <th>v_league</th>\n",
       "      <th>h_name</th>\n",
       "      <th>h_league</th>\n",
       "      <th>day_night</th>\n",
       "      <th>completion</th>\n",
       "      <th>forefeit</th>\n",
       "      <th>protest</th>\n",
       "      <th>park_id</th>\n",
       "      <th>...</th>\n",
       "      <th>h_player_6_id</th>\n",
       "      <th>h_player_6_name</th>\n",
       "      <th>h_player_7_id</th>\n",
       "      <th>h_player_7_name</th>\n",
       "      <th>h_player_8_id</th>\n",
       "      <th>h_player_8_name</th>\n",
       "      <th>h_player_9_id</th>\n",
       "      <th>h_player_9_name</th>\n",
       "      <th>additional_info</th>\n",
       "      <th>acquisition_info</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>171907</td>\n",
       "      <td>171907</td>\n",
       "      <td>171907</td>\n",
       "      <td>171907</td>\n",
       "      <td>171907</td>\n",
       "      <td>140150</td>\n",
       "      <td>116</td>\n",
       "      <td>145</td>\n",
       "      <td>180</td>\n",
       "      <td>171907</td>\n",
       "      <td>...</td>\n",
       "      <td>140838</td>\n",
       "      <td>140838</td>\n",
       "      <td>140838</td>\n",
       "      <td>140838</td>\n",
       "      <td>140838</td>\n",
       "      <td>140838</td>\n",
       "      <td>140838</td>\n",
       "      <td>140838</td>\n",
       "      <td>1456</td>\n",
       "      <td>140841</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>7</td>\n",
       "      <td>148</td>\n",
       "      <td>7</td>\n",
       "      <td>148</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>116</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>245</td>\n",
       "      <td>...</td>\n",
       "      <td>4774</td>\n",
       "      <td>4720</td>\n",
       "      <td>5253</td>\n",
       "      <td>5197</td>\n",
       "      <td>4760</td>\n",
       "      <td>4710</td>\n",
       "      <td>5193</td>\n",
       "      <td>5142</td>\n",
       "      <td>332</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>Sat</td>\n",
       "      <td>CHN</td>\n",
       "      <td>NL</td>\n",
       "      <td>CHN</td>\n",
       "      <td>NL</td>\n",
       "      <td>D</td>\n",
       "      <td>19820711,CHI11,5,5,54</td>\n",
       "      <td>H</td>\n",
       "      <td>V</td>\n",
       "      <td>STL07</td>\n",
       "      <td>...</td>\n",
       "      <td>grimc101</td>\n",
       "      <td>Charlie Grimm</td>\n",
       "      <td>grimc101</td>\n",
       "      <td>Charlie Grimm</td>\n",
       "      <td>lopea102</td>\n",
       "      <td>Al Lopez</td>\n",
       "      <td>spahw101</td>\n",
       "      <td>Warren Spahn</td>\n",
       "      <td>HTBF</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>28891</td>\n",
       "      <td>8870</td>\n",
       "      <td>88866</td>\n",
       "      <td>9024</td>\n",
       "      <td>88867</td>\n",
       "      <td>82724</td>\n",
       "      <td>1</td>\n",
       "      <td>69</td>\n",
       "      <td>90</td>\n",
       "      <td>7022</td>\n",
       "      <td>...</td>\n",
       "      <td>427</td>\n",
       "      <td>427</td>\n",
       "      <td>491</td>\n",
       "      <td>491</td>\n",
       "      <td>676</td>\n",
       "      <td>676</td>\n",
       "      <td>339</td>\n",
       "      <td>339</td>\n",
       "      <td>1112</td>\n",
       "      <td>140841</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4 rows × 78 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       day_of_week  v_name v_league  h_name h_league day_night  \\\n",
       "count       171907  171907   171907  171907   171907    140150   \n",
       "unique           7     148        7     148        7         2   \n",
       "top            Sat     CHN       NL     CHN       NL         D   \n",
       "freq         28891    8870    88866    9024    88867     82724   \n",
       "\n",
       "                   completion forefeit protest park_id       ...         \\\n",
       "count                     116      145     180  171907       ...          \n",
       "unique                    116        3       5     245       ...          \n",
       "top     19820711,CHI11,5,5,54        H       V   STL07       ...          \n",
       "freq                        1       69      90    7022       ...          \n",
       "\n",
       "       h_player_6_id h_player_6_name h_player_7_id h_player_7_name  \\\n",
       "count         140838          140838        140838          140838   \n",
       "unique          4774            4720          5253            5197   \n",
       "top         grimc101   Charlie Grimm      grimc101   Charlie Grimm   \n",
       "freq             427             427           491             491   \n",
       "\n",
       "       h_player_8_id h_player_8_name h_player_9_id h_player_9_name  \\\n",
       "count         140838          140838        140838          140838   \n",
       "unique          4760            4710          5193            5142   \n",
       "top         lopea102        Al Lopez      spahw101    Warren Spahn   \n",
       "freq             676             676           339             339   \n",
       "\n",
       "       additional_info acquisition_info  \n",
       "count             1456           140841  \n",
       "unique             332                1  \n",
       "top               HTBF                Y  \n",
       "freq              1112           140841  \n",
       "\n",
       "[4 rows x 78 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Take a separate copy of the object-dtype columns and summarise them\n",
    "# (count / unique / top / freq per column).\n",
    "gl_obj = gl.select_dtypes(include=['object']).copy()\n",
    "gl_obj.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    Thu\n",
      "1    Fri\n",
      "2    Sat\n",
      "3    Mon\n",
      "4    Tue\n",
      "Name: day_of_week, dtype: object\n",
      "0    Thu\n",
      "1    Fri\n",
      "2    Sat\n",
      "3    Mon\n",
      "4    Tue\n",
      "Name: day_of_week, dtype: category\n",
      "Categories (7, object): [Fri, Mon, Sat, Sun, Thu, Tue, Wed]\n"
     ]
    }
   ],
   "source": [
    "# Compare the raw day_of_week column with its categorical equivalent.\n",
    "dow = gl_obj['day_of_week']\n",
    "dow_cat = dow.astype('category')\n",
    "\n",
    "print(dow.head())\n",
    "print(dow_cat.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    4\n",
       "1    0\n",
       "2    2\n",
       "3    1\n",
       "4    5\n",
       "dtype: int8"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the integer category codes behind the first rows of the categorical column.\n",
    "dow_cat.head().cat.codes\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7.00 MB\n",
      "0.00 MB\n"
     ]
    }
   ],
   "source": [
    "# Footprint of the original object column, then of its categorical version.\n",
    "for day_column in (dow, dow_cat):\n",
    "    print(mem_usage(day_column))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "629.00 MB\n",
      "48.00 MB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Exception TypeError: 'data type not understood' in 'pandas._libs.lib.array_equivalent_object' ignored\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>before</th>\n",
       "      <th>after</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>object</th>\n",
       "      <td>78.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>category</th>\n",
       "      <td>NaN</td>\n",
       "      <td>78.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          before  after\n",
       "object      78.0    NaN\n",
       "category     NaN   78.0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert an object column to 'category' only when fewer than half of its\n",
    "# values are distinct; other columns are carried over unchanged.\n",
    "# The result frame shares gl_obj's index so columns can be assigned directly.\n",
    "converted_obj = pd.DataFrame(index=gl_obj.index)\n",
    "\n",
    "for col in gl_obj.columns:\n",
    "    num_unique_values = len(gl_obj[col].unique())\n",
    "    num_total_values = len(gl_obj[col])\n",
    "    # Compare without dividing: on this Python 2 kernel int / int floors, so the\n",
    "    # original ratio was 0 for every column whose values are not all distinct.\n",
    "    if num_unique_values < 0.5 * num_total_values:\n",
    "        converted_obj[col] = gl_obj[col].astype('category')\n",
    "    else:\n",
    "        converted_obj[col] = gl_obj[col]\n",
    "\n",
    "print(mem_usage(gl_obj))\n",
    "print(mem_usage(converted_obj))\n",
    "\n",
    "# Count how many columns hold each dtype on either side of the conversion.\n",
    "compare_obj = pd.concat([gl_obj.dtypes,converted_obj.dtypes],axis=1)\n",
    "compare_obj.columns = ['before','after']\n",
    "compare_obj.apply(pd.Series.value_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/Users/leiyang/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (12,13,14,15,19,20,81,83,85,87,93,94,95,96,97,98,99,100,105,106,108,109,111,112,114,115,117,118,120,121,123,124,126,127,129,130,132,133,135,136,138,139,141,142,144,145,147,148,150,151,153,154,156,157,160) have mixed types. Specify dtype option on import or set low_memory=False.\n",
	" interactivity=interactivity, compiler=compiler, result=result)\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>date</th>\n",
	" <th>number_of_game</th>\n",
	" <th>day_of_week</th>\n",
	" <th>v_name</th>\n",
	" <th>v_league</th>\n",
	" <th>v_game_number</th>\n",
	" <th>h_name</th>\n",
	" <th>h_league</th>\n",
	" <th>h_game_number</th>\n",
	" <th>v_score</th>\n",
	" <th>...</th>\n",
	" <th>h_player_7_name</th>\n",
	" <th>h_player_7_def_pos</th>\n",
	" <th>h_player_8_id</th>\n",
	" <th>h_player_8_name</th>\n",
	" <th>h_player_8_def_pos</th>\n",
	" <th>h_player_9_id</th>\n",
	" <th>h_player_9_name</th>\n",
	" <th>h_player_9_def_pos</th>\n",
	" <th>additional_info</th>\n",
	" <th>acquisition_info</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>18710504</td>\n",
	" <td>0</td>\n",
	" <td>Thu</td>\n",
	" <td>CL1</td>\n",
	" <td>na</td>\n",
	" <td>1</td>\n",
	" <td>FW1</td>\n",
	" <td>na</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>...</td>\n",
	" <td>Ed Mincher</td>\n",
	" <td>7.0</td>\n",
	" <td>mcdej101</td>\n",
	" <td>James McDermott</td>\n",
	" <td>8.0</td>\n",
	" <td>kellb105</td>\n",
	" <td>Bill Kelly</td>\n",
	" <td>9.0</td>\n",
	" <td>NaN</td>\n",
	" <td>Y</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>18710505</td>\n",
	" <td>0</td>\n",
	" <td>Fri</td>\n",
	" <td>BS1</td>\n",
	" <td>na</td>\n",
	" <td>1</td>\n",
	" <td>WS3</td>\n",
	" <td>na</td>\n",
	" <td>1</td>\n",
	" <td>20</td>\n",
	" <td>...</td>\n",
	" <td>Asa Brainard</td>\n",
	" <td>1.0</td>\n",
	" <td>burrh101</td>\n",
	" <td>Henry Burroughs</td>\n",
	" <td>9.0</td>\n",
	" <td>berth101</td>\n",
	" <td>Henry Berthrong</td>\n",
	" <td>8.0</td>\n",
	" <td>HTBF</td>\n",
	" <td>Y</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>18710506</td>\n",
	" <td>0</td>\n",
	" <td>Sat</td>\n",
	" <td>CL1</td>\n",
	" <td>na</td>\n",
	" <td>2</td>\n",
	" <td>RC1</td>\n",
	" <td>na</td>\n",
	" <td>1</td>\n",
	" <td>12</td>\n",
	" <td>...</td>\n",
	" <td>Pony Sager</td>\n",
	" <td>6.0</td>\n",
	" <td>birdg101</td>\n",
	" <td>George Bird</td>\n",
	" <td>7.0</td>\n",
	" <td>stirg101</td>\n",
	" <td>Gat Stires</td>\n",
	" <td>9.0</td>\n",
	" <td>NaN</td>\n",
	" <td>Y</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>18710508</td>\n",
	" <td>0</td>\n",
	" <td>Mon</td>\n",
	" <td>CL1</td>\n",
	" <td>na</td>\n",
	" <td>3</td>\n",
	" <td>CH1</td>\n",
	" <td>na</td>\n",
	" <td>1</td>\n",
	" <td>12</td>\n",
	" <td>...</td>\n",
	" <td>Ed Duffy</td>\n",
	" <td>6.0</td>\n",
	" <td>pinke101</td>\n",
	" <td>Ed Pinkham</td>\n",
	" <td>5.0</td>\n",
	" <td>zettg101</td>\n",
	" <td>George Zettlein</td>\n",
	" <td>1.0</td>\n",
	" <td>NaN</td>\n",
	" <td>Y</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>18710509</td>\n",
	" <td>0</td>\n",
	" <td>Tue</td>\n",
	" <td>BS1</td>\n",
	" <td>na</td>\n",
	" <td>2</td>\n",
	" <td>TRO</td>\n",
	" <td>na</td>\n",
	" <td>1</td>\n",
	" <td>9</td>\n",
	" <td>...</td>\n",
	" <td>Steve Bellan</td>\n",
	" <td>5.0</td>\n",
	" <td>pikel101</td>\n",
	" <td>Lip Pike</td>\n",
	" <td>3.0</td>\n",
	" <td>cravb101</td>\n",
	" <td>Bill Craver</td>\n",
	" <td>6.0</td>\n",
	" <td>HTBF</td>\n",
	" <td>Y</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>5 rows × 161 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" date number_of_game day_of_week v_name v_league v_game_number h_name \\\n",
	"0 18710504 0 Thu CL1 na 1 FW1 \n",
	"1 18710505 0 Fri BS1 na 1 WS3 \n",
	"2 18710506 0 Sat CL1 na 2 RC1 \n",
	"3 18710508 0 Mon CL1 na 3 CH1 \n",
	"4 18710509 0 Tue BS1 na 2 TRO \n",
	"\n",
	" h_league h_game_number v_score ... h_player_7_name \\\n",
	"0 na 1 0 ... Ed Mincher \n",
	"1 na 1 20 ... Asa Brainard \n",
	"2 na 1 12 ... Pony Sager \n",
	"3 na 1 12 ... Ed Duffy \n",
	"4 na 1 9 ... Steve Bellan \n",
	"\n",
	" h_player_7_def_pos h_player_8_id h_player_8_name h_player_8_def_pos \\\n",
	"0 7.0 mcdej101 James McDermott 8.0 \n",
	"1 1.0 burrh101 Henry Burroughs 9.0 \n",
	"2 6.0 birdg101 George Bird 7.0 \n",
	"3 6.0 pinke101 Ed Pinkham 5.0 \n",
	"4 5.0 pikel101 Lip Pike 3.0 \n",
	"\n",
	" h_player_9_id h_player_9_name h_player_9_def_pos additional_info \\\n",
	"0 kellb105 Bill Kelly 9.0 NaN \n",
	"1 berth101 Henry Berthrong 8.0 HTBF \n",
	"2 stirg101 Gat Stires 9.0 NaN \n",
	"3 zettg101 George Zettlein 1.0 NaN \n",
	"4 cravb101 Bill Craver 6.0 HTBF \n",
	"\n",
	" acquisition_info \n",
	"0 Y \n",
	"1 Y \n",
	"2 Y \n",
	"3 Y \n",
	"4 Y \n",
	"\n",
	"[5 rows x 161 columns]"
	]
	},
	"execution_count": 1,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"\n",
	"import pandas as pd\n",
	"\n",
	"# Read the whole game-log CSV; no dtype is given, so pandas infers every column type.\n",
	"gl = pd.read_csv('game_logs.csv')\n",
	"# Show the first five rows as the cell's output.\n",
	"gl.head()\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"<class 'pandas.core.frame.DataFrame'>\n",
	"RangeIndex: 171907 entries, 0 to 171906\n",
	"Columns: 161 entries, date to acquisition_info\n",
	"dtypes: float64(77), int64(6), object(78)\n",
	"memory usage: 738.1 MB\n"
	]
	}
	],
	"source": [
	"# memory_usage='deep' makes pandas inspect object columns to report their real size.\n",
	"gl.info(memory_usage='deep')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Average memory usage for float columns: 1.29 MB\n",
	"Average memory usage for int columns: 1.12 MB\n",
	"Average memory usage for object columns: 7.96 MB\n"
	]
	}
	],
	"source": [
	"# Report the mean deep memory footprint for each dtype family found in `gl`.\n",
	"# The message template is hoisted so the loop body only does the measurement.\n",
	"report_line = \"Average memory usage for {} columns: {:03.2f} MB\"\n",
	"for dtype in ('float', 'int', 'object'):\n",
	"    usage_bytes = gl.select_dtypes(include=[dtype]).memory_usage(deep=True)\n",
	"    print(report_line.format(dtype, usage_bytes.mean() / 1024 ** 2))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"7.00 MB\n",
	"1.00 MB\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>before</th>\n",
	" <th>after</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>uint8</th>\n",
	" <td>NaN</td>\n",
	" <td>5.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>uint32</th>\n",
	" <td>NaN</td>\n",
	" <td>1.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>int64</th>\n",
	" <td>6.0</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" before after\n",
	"uint8 NaN 5.0\n",
	"uint32 NaN 1.0\n",
	"int64 6.0 NaN"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# We're going to be calculating memory usage a lot,\n",
	"# so we'll create a function to save us some time!\n",
	"\n",
	"def mem_usage(pandas_obj):\n",
	"    \"\"\"Return the deep memory footprint of a pandas object, formatted in MB.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    pandas_obj : pandas.DataFrame or pandas.Series\n",
	"        For a DataFrame the per-column (and index) usages are summed; any\n",
	"        other object is treated as a Series.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    str\n",
	"        The size with two decimals followed by ' MB', e.g. '1.50 MB'.\n",
	"    \"\"\"\n",
	"    if isinstance(pandas_obj, pd.DataFrame):\n",
	"        usage_b = pandas_obj.memory_usage(deep=True).sum()\n",
	"    else:  # we assume if not a df it's a series\n",
	"        usage_b = pandas_obj.memory_usage(deep=True)\n",
	"    # Divide by a float: on the Python 2 kernel this notebook uses, int / int\n",
	"    # floors the result, so every size was reported as a whole number of MB.\n",
	"    usage_mb = usage_b / 1024.0 ** 2  # convert bytes to megabytes\n",
	"    return \"{:03.2f} MB\".format(usage_mb)\n",
	"\n",
	"# Pull out the integer columns and downcast each one to the smallest unsigned type.\n",
	"gl_int = gl.select_dtypes(include=['int'])\n",
	"converted_int = gl_int.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))\n",
	"\n",
	"# Footprint before, then after, the downcast.\n",
	"print(mem_usage(gl_int))\n",
	"print(mem_usage(converted_int))\n",
	"\n",
	"# Side-by-side dtypes; `keys` labels the two columns directly.\n",
	"compare_ints = pd.concat([gl_int.dtypes, converted_int.dtypes], axis=1, keys=['before', 'after'])\n",
	"compare_ints.apply(pd.Series.value_counts)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## 查看 gl_int、converted_int 和 compare_ints"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" date number_of_game v_game_number h_game_number v_score h_score\n",
	"0 18710504 0 1 1 0 2\n",
	"1 18710505 0 1 1 20 18\n",
	"2 18710506 0 2 1 12 4\n",
	"3 18710508 0 3 1 12 14\n",
	"4 18710509 0 2 1 9 5\n",
	"5 18710511 0 2 4 18 10\n",
	"6 18710513 0 2 5 12 8\n",
	"7 18710513 0 3 2 14 5\n",
	"8 18710515 0 3 3 6 12\n",
	"9 18710516 0 2 3 29 14\n",
	" date number_of_game v_game_number h_game_number v_score h_score\n",
	"0 18710504 0 1 1 0 2\n",
	"1 18710505 0 1 1 20 18\n",
	"2 18710506 0 2 1 12 4\n",
	"3 18710508 0 3 1 12 14\n",
	"4 18710509 0 2 1 9 5\n",
	"5 18710511 0 2 4 18 10\n",
	"6 18710513 0 2 5 12 8\n",
	"7 18710513 0 3 2 14 5\n",
	"8 18710515 0 3 3 6 12\n",
	"9 18710516 0 2 3 29 14\n",
	" before after\n",
	"date int64 uint32\n",
	"number_of_game int64 uint8\n",
	"v_game_number int64 uint8\n",
	"h_game_number int64 uint8\n",
	"v_score int64 uint8\n",
	"h_score int64 uint8\n"
	]
	}
	],
	"source": [
	"# Use the print() call form, as the other cells do; with a single argument it\n",
	"# produces the same output on Python 2 and also runs on Python 3.\n",
	"print(gl_int.head(10))\n",
	"\n",
	"print(converted_int.head(10))\n",
	"\n",
	"print(compare_ints)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"100.00 MB\n",
	"50.00 MB\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>before</th>\n",
	" <th>after</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>float32</th>\n",
	" <td>NaN</td>\n",
	" <td>77.0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>float64</th>\n",
	" <td>77.0</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" before after\n",
	"float32 NaN 77.0\n",
	"float64 77.0 NaN"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Isolate the float columns and downcast each one to the smallest float type.\n",
	"gl_float = gl.select_dtypes(include=['float'])\n",
	"converted_float = gl_float.apply(lambda column: pd.to_numeric(column, downcast='float'))\n",
	"\n",
	"# Footprint before, then after, the downcast.\n",
	"print(mem_usage(gl_float))\n",
	"print(mem_usage(converted_float))\n",
	"\n",
	"# Side-by-side dtypes; `keys` labels the two columns directly.\n",
	"compare_floats = pd.concat([gl_float.dtypes, converted_float.dtypes], axis=1, keys=['before', 'after'])\n",
	"compare_floats.apply(pd.Series.value_counts)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"738.00 MB\n",
	"681.00 MB\n"
	]
	}
	],
	"source": [
	"# Work on a copy so the original `gl` remains available for comparison.\n",
	"optimized_gl = gl.copy()\n",
	"\n",
	"# Overwrite the numeric columns with their downcast versions.\n",
	"optimized_gl[converted_int.columns] = converted_int\n",
	"optimized_gl[converted_float.columns] = converted_float\n",
	"\n",
	"# Total footprint of the original frame, then of the optimized copy.\n",
	"print(mem_usage(gl))\n",
	"print(mem_usage(optimized_gl))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>day_of_week</th>\n",
	" <th>v_name</th>\n",
	" <th>v_league</th>\n",
	" <th>h_name</th>\n",
	" <th>h_league</th>\n",
	" <th>day_night</th>\n",
	" <th>completion</th>\n",
	" <th>forefeit</th>\n",
	" <th>protest</th>\n",
	" <th>park_id</th>\n",
	" <th>...</th>\n",
	" <th>h_player_6_id</th>\n",
	" <th>h_player_6_name</th>\n",
	" <th>h_player_7_id</th>\n",
	" <th>h_player_7_name</th>\n",
	" <th>h_player_8_id</th>\n",
	" <th>h_player_8_name</th>\n",
	" <th>h_player_9_id</th>\n",
	" <th>h_player_9_name</th>\n",
	" <th>additional_info</th>\n",
	" <th>acquisition_info</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>count</th>\n",
	" <td>171907</td>\n",
	" <td>171907</td>\n",
	" <td>171907</td>\n",
	" <td>171907</td>\n",
	" <td>171907</td>\n",
	" <td>140150</td>\n",
	" <td>116</td>\n",
	" <td>145</td>\n",
	" <td>180</td>\n",
	" <td>171907</td>\n",
	" <td>...</td>\n",
	" <td>140838</td>\n",
	" <td>140838</td>\n",
	" <td>140838</td>\n",
	" <td>140838</td>\n",
	" <td>140838</td>\n",
	" <td>140838</td>\n",
	" <td>140838</td>\n",
	" <td>140838</td>\n",
	" <td>1456</td>\n",
	" <td>140841</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>unique</th>\n",
	" <td>7</td>\n",
	" <td>148</td>\n",
	" <td>7</td>\n",
	" <td>148</td>\n",
	" <td>7</td>\n",
	" <td>2</td>\n",
	" <td>116</td>\n",
	" <td>3</td>\n",
	" <td>5</td>\n",
	" <td>245</td>\n",
	" <td>...</td>\n",
	" <td>4774</td>\n",
	" <td>4720</td>\n",
	" <td>5253</td>\n",
	" <td>5197</td>\n",
	" <td>4760</td>\n",
	" <td>4710</td>\n",
	" <td>5193</td>\n",
	" <td>5142</td>\n",
	" <td>332</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>top</th>\n",
	" <td>Sat</td>\n",
	" <td>CHN</td>\n",
	" <td>NL</td>\n",
	" <td>CHN</td>\n",
	" <td>NL</td>\n",
	" <td>D</td>\n",
	" <td>19820711,CHI11,5,5,54</td>\n",
	" <td>H</td>\n",
	" <td>V</td>\n",
	" <td>STL07</td>\n",
	" <td>...</td>\n",
	" <td>grimc101</td>\n",
	" <td>Charlie Grimm</td>\n",
	" <td>grimc101</td>\n",
	" <td>Charlie Grimm</td>\n",
	" <td>lopea102</td>\n",
	" <td>Al Lopez</td>\n",
	" <td>spahw101</td>\n",
	" <td>Warren Spahn</td>\n",
	" <td>HTBF</td>\n",
	" <td>Y</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>freq</th>\n",
	" <td>28891</td>\n",
	" <td>8870</td>\n",
	" <td>88866</td>\n",
	" <td>9024</td>\n",
	" <td>88867</td>\n",
	" <td>82724</td>\n",
	" <td>1</td>\n",
	" <td>69</td>\n",
	" <td>90</td>\n",
	" <td>7022</td>\n",
	" <td>...</td>\n",
	" <td>427</td>\n",
	" <td>427</td>\n",
	" <td>491</td>\n",
	" <td>491</td>\n",
	" <td>676</td>\n",
	" <td>676</td>\n",
	" <td>339</td>\n",
	" <td>339</td>\n",
	" <td>1112</td>\n",
	" <td>140841</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>4 rows × 78 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" day_of_week v_name v_league h_name h_league day_night \\\n",
	"count 171907 171907 171907 171907 171907 140150 \n",
	"unique 7 148 7 148 7 2 \n",
	"top Sat CHN NL CHN NL D \n",
	"freq 28891 8870 88866 9024 88867 82724 \n",
	"\n",
	" completion forefeit protest park_id ... \\\n",
	"count 116 145 180 171907 ... \n",
	"unique 116 3 5 245 ... \n",
	"top 19820711,CHI11,5,5,54 H V STL07 ... \n",
	"freq 1 69 90 7022 ... \n",
	"\n",
	" h_player_6_id h_player_6_name h_player_7_id h_player_7_name \\\n",
	"count 140838 140838 140838 140838 \n",
	"unique 4774 4720 5253 5197 \n",
	"top grimc101 Charlie Grimm grimc101 Charlie Grimm \n",
	"freq 427 427 491 491 \n",
	"\n",
	" h_player_8_id h_player_8_name h_player_9_id h_player_9_name \\\n",
	"count 140838 140838 140838 140838 \n",
	"unique 4760 4710 5193 5142 \n",
	"top lopea102 Al Lopez spahw101 Warren Spahn \n",
	"freq 676 676 339 339 \n",
	"\n",
	" additional_info acquisition_info \n",
	"count 1456 140841 \n",
	"unique 332 1 \n",
	"top HTBF Y \n",
	"freq 1112 140841 \n",
	"\n",
	"[4 rows x 78 columns]"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Take an independent copy of the object-dtype columns.\n",
	"gl_obj = gl.select_dtypes(include=['object']).copy()\n",
	"# For object columns describe() reports count / unique / top / freq.\n",
	"gl_obj.describe()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0 Thu\n",
	"1 Fri\n",
	"2 Sat\n",
	"3 Mon\n",
	"4 Tue\n",
	"Name: day_of_week, dtype: object\n",
	"0 Thu\n",
	"1 Fri\n",
	"2 Sat\n",
	"3 Mon\n",
	"4 Tue\n",
	"Name: day_of_week, dtype: category\n",
	"Categories (7, object): [Fri, Mon, Sat, Sun, Thu, Tue, Wed]\n"
	]
	}
	],
	"source": [
	"# Show the same column before and after casting it to the category dtype.\n",
	"dow = gl_obj.day_of_week\n",
	"print(dow.head())\n",
	"\n",
	"dow_cat = dow.astype('category')\n",
	"print(dow_cat.head())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0 4\n",
	"1 0\n",
	"2 2\n",
	"3 1\n",
	"4 5\n",
	"dtype: int8"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# .cat.codes exposes the integer code stored for each categorical value.\n",
	"dow_cat.head().cat.codes\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"7.00 MB\n",
	"0.00 MB\n"
	]
	}
	],
	"source": [
	"# Footprint of the column as object dtype, then as category.\n",
	"print(mem_usage(dow))\n",
	"print(mem_usage(dow_cat))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"629.00 MB\n",
	"48.00 MB\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"Exception TypeError: 'data type not understood' in 'pandas._libs.lib.array_equivalent_object' ignored\n"
	]
	},
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>before</th>\n",
	" <th>after</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>object</th>\n",
	" <td>78.0</td>\n",
	" <td>NaN</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>category</th>\n",
	" <td>NaN</td>\n",
	" <td>78.0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" before after\n",
	"object 78.0 NaN\n",
	"category NaN 78.0"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Build a copy of `gl_obj` in which every column whose distinct values make up\n",
	"# less than half of its rows is stored as a category; other columns are kept as-is.\n",
	"converted_obj = pd.DataFrame()\n",
	"\n",
	"for col in gl_obj.columns:\n",
	"    num_unique_values = len(gl_obj[col].unique())\n",
	"    num_total_values = len(gl_obj[col])\n",
	"    # float() forces true division: on the Python 2 kernel int / int floors to 0,\n",
	"    # which made the 0.5 threshold meaningless. The leading check avoids a\n",
	"    # ZeroDivisionError when the frame has no rows.\n",
	"    if num_total_values and float(num_unique_values) / num_total_values < 0.5:\n",
	"        # Plain column assignment is the documented way to add a column to an\n",
	"        # initially empty frame; it adopts the index of the assigned Series.\n",
	"        converted_obj[col] = gl_obj[col].astype('category')\n",
	"    else:\n",
	"        converted_obj[col] = gl_obj[col]\n",
	"# Footprint of the object columns before, then after, the category conversion.\n",
	"print(mem_usage(gl_obj))\n",
	"print(mem_usage(converted_obj))\n",
	"\n",
	"# Side-by-side dtypes; `keys` labels the two columns directly.\n",
	"compare_obj = pd.concat([gl_obj.dtypes, converted_obj.dtypes], axis=1, keys=['before', 'after'])\n",
	"compare_obj.apply(pd.Series.value_counts)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.13"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}