Skip to content

Instantly share code, notes, and snippets.

@zhusimaji
Created March 15, 2018 09:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zhusimaji/f428e45758a67579ff8e4a5679e761b4 to your computer and use it in GitHub Desktop.
Save zhusimaji/f428e45758a67579ff8e4a5679e761b4 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/leiyang/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (12,13,14,15,19,20,81,83,85,87,93,94,95,96,97,98,99,100,105,106,108,109,111,112,114,115,117,118,120,121,123,124,126,127,129,130,132,133,135,136,138,139,141,142,144,145,147,148,150,151,153,154,156,157,160) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>number_of_game</th>\n",
" <th>day_of_week</th>\n",
" <th>v_name</th>\n",
" <th>v_league</th>\n",
" <th>v_game_number</th>\n",
" <th>h_name</th>\n",
" <th>h_league</th>\n",
" <th>h_game_number</th>\n",
" <th>v_score</th>\n",
" <th>...</th>\n",
" <th>h_player_7_name</th>\n",
" <th>h_player_7_def_pos</th>\n",
" <th>h_player_8_id</th>\n",
" <th>h_player_8_name</th>\n",
" <th>h_player_8_def_pos</th>\n",
" <th>h_player_9_id</th>\n",
" <th>h_player_9_name</th>\n",
" <th>h_player_9_def_pos</th>\n",
" <th>additional_info</th>\n",
" <th>acquisition_info</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>18710504</td>\n",
" <td>0</td>\n",
" <td>Thu</td>\n",
" <td>CL1</td>\n",
" <td>na</td>\n",
" <td>1</td>\n",
" <td>FW1</td>\n",
" <td>na</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>Ed Mincher</td>\n",
" <td>7.0</td>\n",
" <td>mcdej101</td>\n",
" <td>James McDermott</td>\n",
" <td>8.0</td>\n",
" <td>kellb105</td>\n",
" <td>Bill Kelly</td>\n",
" <td>9.0</td>\n",
" <td>NaN</td>\n",
" <td>Y</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>18710505</td>\n",
" <td>0</td>\n",
" <td>Fri</td>\n",
" <td>BS1</td>\n",
" <td>na</td>\n",
" <td>1</td>\n",
" <td>WS3</td>\n",
" <td>na</td>\n",
" <td>1</td>\n",
" <td>20</td>\n",
" <td>...</td>\n",
" <td>Asa Brainard</td>\n",
" <td>1.0</td>\n",
" <td>burrh101</td>\n",
" <td>Henry Burroughs</td>\n",
" <td>9.0</td>\n",
" <td>berth101</td>\n",
" <td>Henry Berthrong</td>\n",
" <td>8.0</td>\n",
" <td>HTBF</td>\n",
" <td>Y</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>18710506</td>\n",
" <td>0</td>\n",
" <td>Sat</td>\n",
" <td>CL1</td>\n",
" <td>na</td>\n",
" <td>2</td>\n",
" <td>RC1</td>\n",
" <td>na</td>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>...</td>\n",
" <td>Pony Sager</td>\n",
" <td>6.0</td>\n",
" <td>birdg101</td>\n",
" <td>George Bird</td>\n",
" <td>7.0</td>\n",
" <td>stirg101</td>\n",
" <td>Gat Stires</td>\n",
" <td>9.0</td>\n",
" <td>NaN</td>\n",
" <td>Y</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>18710508</td>\n",
" <td>0</td>\n",
" <td>Mon</td>\n",
" <td>CL1</td>\n",
" <td>na</td>\n",
" <td>3</td>\n",
" <td>CH1</td>\n",
" <td>na</td>\n",
" <td>1</td>\n",
" <td>12</td>\n",
" <td>...</td>\n",
" <td>Ed Duffy</td>\n",
" <td>6.0</td>\n",
" <td>pinke101</td>\n",
" <td>Ed Pinkham</td>\n",
" <td>5.0</td>\n",
" <td>zettg101</td>\n",
" <td>George Zettlein</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>Y</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>18710509</td>\n",
" <td>0</td>\n",
" <td>Tue</td>\n",
" <td>BS1</td>\n",
" <td>na</td>\n",
" <td>2</td>\n",
" <td>TRO</td>\n",
" <td>na</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>...</td>\n",
" <td>Steve Bellan</td>\n",
" <td>5.0</td>\n",
" <td>pikel101</td>\n",
" <td>Lip Pike</td>\n",
" <td>3.0</td>\n",
" <td>cravb101</td>\n",
" <td>Bill Craver</td>\n",
" <td>6.0</td>\n",
" <td>HTBF</td>\n",
" <td>Y</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 161 columns</p>\n",
"</div>"
],
"text/plain": [
" date number_of_game day_of_week v_name v_league v_game_number h_name \\\n",
"0 18710504 0 Thu CL1 na 1 FW1 \n",
"1 18710505 0 Fri BS1 na 1 WS3 \n",
"2 18710506 0 Sat CL1 na 2 RC1 \n",
"3 18710508 0 Mon CL1 na 3 CH1 \n",
"4 18710509 0 Tue BS1 na 2 TRO \n",
"\n",
" h_league h_game_number v_score ... h_player_7_name \\\n",
"0 na 1 0 ... Ed Mincher \n",
"1 na 1 20 ... Asa Brainard \n",
"2 na 1 12 ... Pony Sager \n",
"3 na 1 12 ... Ed Duffy \n",
"4 na 1 9 ... Steve Bellan \n",
"\n",
" h_player_7_def_pos h_player_8_id h_player_8_name h_player_8_def_pos \\\n",
"0 7.0 mcdej101 James McDermott 8.0 \n",
"1 1.0 burrh101 Henry Burroughs 9.0 \n",
"2 6.0 birdg101 George Bird 7.0 \n",
"3 6.0 pinke101 Ed Pinkham 5.0 \n",
"4 5.0 pikel101 Lip Pike 3.0 \n",
"\n",
" h_player_9_id h_player_9_name h_player_9_def_pos additional_info \\\n",
"0 kellb105 Bill Kelly 9.0 NaN \n",
"1 berth101 Henry Berthrong 8.0 HTBF \n",
"2 stirg101 Gat Stires 9.0 NaN \n",
"3 zettg101 George Zettlein 1.0 NaN \n",
"4 cravb101 Bill Craver 6.0 HTBF \n",
"\n",
" acquisition_info \n",
"0 Y \n",
"1 Y \n",
"2 Y \n",
"3 Y \n",
"4 Y \n",
"\n",
"[5 rows x 161 columns]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"import pandas as pd\n",
"\n",
"gl = pd.read_csv('game_logs.csv')\n",
"gl.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 171907 entries, 0 to 171906\n",
"Columns: 161 entries, date to acquisition_info\n",
"dtypes: float64(77), int64(6), object(78)\n",
"memory usage: 738.1 MB\n"
]
}
],
"source": [
"gl.info(memory_usage='deep')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Average memory usage for float columns: 1.29 MB\n",
"Average memory usage for int columns: 1.12 MB\n",
"Average memory usage for object columns: 7.96 MB\n"
]
}
],
"source": [
"for dtype in ['float','int','object']:\n",
" selected_dtype = gl.select_dtypes(include=[dtype])\n",
" mean_usage_b = selected_dtype.memory_usage(deep=True).mean()\n",
" mean_usage_mb = mean_usage_b / 1024 ** 2\n",
" print(\"Average memory usage for {} columns: {:03.2f} MB\".format(dtype,mean_usage_mb))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7.00 MB\n",
"1.00 MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>before</th>\n",
" <th>after</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>uint8</th>\n",
" <td>NaN</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>uint32</th>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>int64</th>\n",
" <td>6.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" before after\n",
"uint8 NaN 5.0\n",
"uint32 NaN 1.0\n",
"int64 6.0 NaN"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We're going to be calculating memory usage a lot,\n",
"# so we'll create a function to save us some time!\n",
"\n",
"def mem_usage(pandas_obj):\n",
" if isinstance(pandas_obj,pd.DataFrame):\n",
" usage_b = pandas_obj.memory_usage(deep=True).sum()\n",
" else: # we assume if not a df it's a series\n",
" usage_b = pandas_obj.memory_usage(deep=True)\n",
" usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes\n",
" return \"{:03.2f} MB\".format(usage_mb)\n",
"\n",
"gl_int = gl.select_dtypes(include=['int'])\n",
"converted_int = gl_int.apply(pd.to_numeric,downcast='unsigned')\n",
"\n",
"print(mem_usage(gl_int))\n",
"print(mem_usage(converted_int))\n",
"\n",
"compare_ints = pd.concat([gl_int.dtypes,converted_int.dtypes],axis=1)\n",
"compare_ints.columns = ['before','after']\n",
"compare_ints.apply(pd.Series.value_counts)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 查看gl_int和converted_int compare_ints"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" date number_of_game v_game_number h_game_number v_score h_score\n",
"0 18710504 0 1 1 0 2\n",
"1 18710505 0 1 1 20 18\n",
"2 18710506 0 2 1 12 4\n",
"3 18710508 0 3 1 12 14\n",
"4 18710509 0 2 1 9 5\n",
"5 18710511 0 2 4 18 10\n",
"6 18710513 0 2 5 12 8\n",
"7 18710513 0 3 2 14 5\n",
"8 18710515 0 3 3 6 12\n",
"9 18710516 0 2 3 29 14\n",
" date number_of_game v_game_number h_game_number v_score h_score\n",
"0 18710504 0 1 1 0 2\n",
"1 18710505 0 1 1 20 18\n",
"2 18710506 0 2 1 12 4\n",
"3 18710508 0 3 1 12 14\n",
"4 18710509 0 2 1 9 5\n",
"5 18710511 0 2 4 18 10\n",
"6 18710513 0 2 5 12 8\n",
"7 18710513 0 3 2 14 5\n",
"8 18710515 0 3 3 6 12\n",
"9 18710516 0 2 3 29 14\n",
" before after\n",
"date int64 uint32\n",
"number_of_game int64 uint8\n",
"v_game_number int64 uint8\n",
"h_game_number int64 uint8\n",
"v_score int64 uint8\n",
"h_score int64 uint8\n"
]
}
],
"source": [
"print gl_int.head(10)\n",
"\n",
"print converted_int.head(10)\n",
"\n",
"print compare_ints"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100.00 MB\n",
"50.00 MB\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>before</th>\n",
" <th>after</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>float32</th>\n",
" <td>NaN</td>\n",
" <td>77.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>float64</th>\n",
" <td>77.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" before after\n",
"float32 NaN 77.0\n",
"float64 77.0 NaN"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gl_float = gl.select_dtypes(include=['float'])\n",
"converted_float = gl_float.apply(pd.to_numeric,downcast='float')\n",
"\n",
"print(mem_usage(gl_float))\n",
"print(mem_usage(converted_float))\n",
"\n",
"compare_floats = pd.concat([gl_float.dtypes,converted_float.dtypes],axis=1)\n",
"compare_floats.columns = ['before','after']\n",
"compare_floats.apply(pd.Series.value_counts)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"738.00 MB\n",
"681.00 MB\n"
]
}
],
"source": [
"optimized_gl = gl.copy()\n",
"\n",
"optimized_gl[converted_int.columns] = converted_int\n",
"optimized_gl[converted_float.columns] = converted_float\n",
"\n",
"print(mem_usage(gl))\n",
"print(mem_usage(optimized_gl))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>day_of_week</th>\n",
" <th>v_name</th>\n",
" <th>v_league</th>\n",
" <th>h_name</th>\n",
" <th>h_league</th>\n",
" <th>day_night</th>\n",
" <th>completion</th>\n",
" <th>forefeit</th>\n",
" <th>protest</th>\n",
" <th>park_id</th>\n",
" <th>...</th>\n",
" <th>h_player_6_id</th>\n",
" <th>h_player_6_name</th>\n",
" <th>h_player_7_id</th>\n",
" <th>h_player_7_name</th>\n",
" <th>h_player_8_id</th>\n",
" <th>h_player_8_name</th>\n",
" <th>h_player_9_id</th>\n",
" <th>h_player_9_name</th>\n",
" <th>additional_info</th>\n",
" <th>acquisition_info</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>171907</td>\n",
" <td>171907</td>\n",
" <td>171907</td>\n",
" <td>171907</td>\n",
" <td>171907</td>\n",
" <td>140150</td>\n",
" <td>116</td>\n",
" <td>145</td>\n",
" <td>180</td>\n",
" <td>171907</td>\n",
" <td>...</td>\n",
" <td>140838</td>\n",
" <td>140838</td>\n",
" <td>140838</td>\n",
" <td>140838</td>\n",
" <td>140838</td>\n",
" <td>140838</td>\n",
" <td>140838</td>\n",
" <td>140838</td>\n",
" <td>1456</td>\n",
" <td>140841</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>7</td>\n",
" <td>148</td>\n",
" <td>7</td>\n",
" <td>148</td>\n",
" <td>7</td>\n",
" <td>2</td>\n",
" <td>116</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>245</td>\n",
" <td>...</td>\n",
" <td>4774</td>\n",
" <td>4720</td>\n",
" <td>5253</td>\n",
" <td>5197</td>\n",
" <td>4760</td>\n",
" <td>4710</td>\n",
" <td>5193</td>\n",
" <td>5142</td>\n",
" <td>332</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>Sat</td>\n",
" <td>CHN</td>\n",
" <td>NL</td>\n",
" <td>CHN</td>\n",
" <td>NL</td>\n",
" <td>D</td>\n",
" <td>19820711,CHI11,5,5,54</td>\n",
" <td>H</td>\n",
" <td>V</td>\n",
" <td>STL07</td>\n",
" <td>...</td>\n",
" <td>grimc101</td>\n",
" <td>Charlie Grimm</td>\n",
" <td>grimc101</td>\n",
" <td>Charlie Grimm</td>\n",
" <td>lopea102</td>\n",
" <td>Al Lopez</td>\n",
" <td>spahw101</td>\n",
" <td>Warren Spahn</td>\n",
" <td>HTBF</td>\n",
" <td>Y</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>28891</td>\n",
" <td>8870</td>\n",
" <td>88866</td>\n",
" <td>9024</td>\n",
" <td>88867</td>\n",
" <td>82724</td>\n",
" <td>1</td>\n",
" <td>69</td>\n",
" <td>90</td>\n",
" <td>7022</td>\n",
" <td>...</td>\n",
" <td>427</td>\n",
" <td>427</td>\n",
" <td>491</td>\n",
" <td>491</td>\n",
" <td>676</td>\n",
" <td>676</td>\n",
" <td>339</td>\n",
" <td>339</td>\n",
" <td>1112</td>\n",
" <td>140841</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4 rows × 78 columns</p>\n",
"</div>"
],
"text/plain": [
" day_of_week v_name v_league h_name h_league day_night \\\n",
"count 171907 171907 171907 171907 171907 140150 \n",
"unique 7 148 7 148 7 2 \n",
"top Sat CHN NL CHN NL D \n",
"freq 28891 8870 88866 9024 88867 82724 \n",
"\n",
" completion forefeit protest park_id ... \\\n",
"count 116 145 180 171907 ... \n",
"unique 116 3 5 245 ... \n",
"top 19820711,CHI11,5,5,54 H V STL07 ... \n",
"freq 1 69 90 7022 ... \n",
"\n",
" h_player_6_id h_player_6_name h_player_7_id h_player_7_name \\\n",
"count 140838 140838 140838 140838 \n",
"unique 4774 4720 5253 5197 \n",
"top grimc101 Charlie Grimm grimc101 Charlie Grimm \n",
"freq 427 427 491 491 \n",
"\n",
" h_player_8_id h_player_8_name h_player_9_id h_player_9_name \\\n",
"count 140838 140838 140838 140838 \n",
"unique 4760 4710 5193 5142 \n",
"top lopea102 Al Lopez spahw101 Warren Spahn \n",
"freq 676 676 339 339 \n",
"\n",
" additional_info acquisition_info \n",
"count 1456 140841 \n",
"unique 332 1 \n",
"top HTBF Y \n",
"freq 1112 140841 \n",
"\n",
"[4 rows x 78 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gl_obj = gl.select_dtypes(include=['object']).copy()\n",
"gl_obj.describe()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 Thu\n",
"1 Fri\n",
"2 Sat\n",
"3 Mon\n",
"4 Tue\n",
"Name: day_of_week, dtype: object\n",
"0 Thu\n",
"1 Fri\n",
"2 Sat\n",
"3 Mon\n",
"4 Tue\n",
"Name: day_of_week, dtype: category\n",
"Categories (7, object): [Fri, Mon, Sat, Sun, Thu, Tue, Wed]\n"
]
}
],
"source": [
"dow = gl_obj.day_of_week\n",
"print(dow.head())\n",
"\n",
"dow_cat = dow.astype('category')\n",
"print(dow_cat.head())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 4\n",
"1 0\n",
"2 2\n",
"3 1\n",
"4 5\n",
"dtype: int8"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dow_cat.head().cat.codes\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7.00 MB\n",
"0.00 MB\n"
]
}
],
"source": [
"print(mem_usage(dow))\n",
"print(mem_usage(dow_cat))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"629.00 MB\n",
"48.00 MB\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Exception TypeError: 'data type not understood' in 'pandas._libs.lib.array_equivalent_object' ignored\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>before</th>\n",
" <th>after</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>object</th>\n",
" <td>78.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>category</th>\n",
" <td>NaN</td>\n",
" <td>78.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" before after\n",
"object 78.0 NaN\n",
"category NaN 78.0"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"converted_obj = pd.DataFrame()\n",
"\n",
"for col in gl_obj.columns:\n",
" num_unique_values = len(gl_obj[col].unique())\n",
" num_total_values = len(gl_obj[col])\n",
" if num_unique_values / num_total_values < 0.5:\n",
" converted_obj.loc[:,col] = gl_obj[col].astype('category')\n",
" else:\n",
" converted_obj.loc[:,col] = gl_obj[col]\n",
"print(mem_usage(gl_obj))\n",
"print(mem_usage(converted_obj))\n",
"\n",
"compare_obj = pd.concat([gl_obj.dtypes,converted_obj.dtypes],axis=1)\n",
"compare_obj.columns = ['before','after']\n",
"compare_obj.apply(pd.Series.value_counts)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment