Skip to content

Instantly share code, notes, and snippets.

@pavithraes
Last active April 29, 2022 15:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pavithraes/546d3d35165d2925be75f53f0387008d to your computer and use it in GitHub Desktop.
Save pavithraes/546d3d35165d2925be75f53f0387008d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b92238fb-9019-4d33-b5f6-827d639572f1",
"metadata": {},
"outputs": [],
"source": [
"import dask.dataframe as dd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4d7014ee-89df-44e1-9181-61b23acbee78",
"metadata": {},
"outputs": [],
"source": [
"# Map every CSV column name to the 'object' dtype; this mapping is passed as\n",
"# `dtype=` to dd.read_csv in the cells below.\n",
"dtype = dict.fromkeys(\n",
"    [\n",
"        'IMD_Decile_From_LSOA',\n",
"        'Age_Band',\n",
"        'Sex',\n",
"        'AE_Arrive_Date',\n",
"        'AE_Arrive_HourOfDay',\n",
"        'AE_Time_Mins',\n",
"        'AE_HRG',\n",
"        'AE_Num_Diagnoses',\n",
"        'AE_Num_Investigations',\n",
"        'AE_Num_Treatments',\n",
"        'AE_Arrival_Mode',\n",
"        'Provider_Patient_Distance_Miles',\n",
"        'ProvID',\n",
"        'Admitted_Flag',\n",
"        'Admission_Method',\n",
"        'ICD10_Chapter_Code',\n",
"        'Treatment_Function_Code',\n",
"        'Length_Of_Stay_Days',\n",
"    ],\n",
"    'object',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1a54b556-73ea-4f47-9d7b-e2a650d92f75",
"metadata": {},
"outputs": [],
"source": [
"# Baseline read: no `header`/`names` arguments, so column names come from the\n",
"# file's header line (the read_csv default).\n",
"ddf1 = dd.read_csv(\n",
"    \"data/AESyntheticData.csv\",\n",
"    dtype=dtype,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "dd5094b0-d350-4e4f-b940-87c3723a1159",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>IMD_Decile_From_LSOA</th>\n",
" <th>Age_Band</th>\n",
" <th>Sex</th>\n",
" <th>AE_Arrive_Date</th>\n",
" <th>AE_Arrive_HourOfDay</th>\n",
" <th>AE_Time_Mins</th>\n",
" <th>AE_HRG</th>\n",
" <th>AE_Num_Diagnoses</th>\n",
" <th>AE_Num_Investigations</th>\n",
" <th>AE_Num_Treatments</th>\n",
" <th>AE_Arrival_Mode</th>\n",
" <th>Provider_Patient_Distance_Miles</th>\n",
" <th>ProvID</th>\n",
" <th>Admitted_Flag</th>\n",
" <th>Admission_Method</th>\n",
" <th>ICD10_Chapter_Code</th>\n",
" <th>Treatment_Function_Code</th>\n",
" <th>Length_Of_Stay_Days</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>65-84</td>\n",
" <td>1</td>\n",
" <td>2015-07-02 00:00:00</td>\n",
" <td>17-20</td>\n",
" <td>210</td>\n",
" <td>High</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>15207</td>\n",
" <td>1</td>\n",
" <td>21</td>\n",
" <td>XVIII</td>\n",
" <td>180</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>18-24</td>\n",
" <td>1</td>\n",
" <td>2017-05-31 00:00:00</td>\n",
" <td>17-20</td>\n",
" <td>20</td>\n",
" <td>Low</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>15321</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>65-84</td>\n",
" <td>2</td>\n",
" <td>2015-10-25 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>280</td>\n",
" <td>Nothing</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>15269</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>25-44</td>\n",
" <td>1</td>\n",
" <td>2014-06-16 00:00:00</td>\n",
" <td>21-24</td>\n",
" <td>150</td>\n",
" <td>Low</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>15239</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>7</td>\n",
" <td>18-24</td>\n",
" <td>2</td>\n",
" <td>2017-11-18 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>180</td>\n",
" <td>Low</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>15286</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IMD_Decile_From_LSOA Age_Band Sex AE_Arrive_Date AE_Arrive_HourOfDay \\\n",
"0 2 65-84 1 2015-07-02 00:00:00 17-20 \n",
"1 6 18-24 1 2017-05-31 00:00:00 17-20 \n",
"2 2 65-84 2 2015-10-25 00:00:00 13-16 \n",
"3 4 25-44 1 2014-06-16 00:00:00 21-24 \n",
"4 7 18-24 2 2017-11-18 00:00:00 13-16 \n",
"\n",
" AE_Time_Mins AE_HRG AE_Num_Diagnoses AE_Num_Investigations \\\n",
"0 210 High 1 5 \n",
"1 20 Low 0 1 \n",
"2 280 Nothing 1 0 \n",
"3 150 Low 1 2 \n",
"4 180 Low 1 1 \n",
"\n",
" AE_Num_Treatments AE_Arrival_Mode Provider_Patient_Distance_Miles ProvID \\\n",
"0 3 2 1 15207 \n",
"1 1 2 5 15321 \n",
"2 1 1 2 15269 \n",
"3 2 2 1 15239 \n",
"4 2 2 1 15286 \n",
"\n",
" Admitted_Flag Admission_Method ICD10_Chapter_Code Treatment_Function_Code \\\n",
"0 1 21 XVIII 180 \n",
"1 0 NaN NaN NaN \n",
"2 0 NaN NaN NaN \n",
"3 0 NaN NaN NaN \n",
"4 0 NaN NaN NaN \n",
"\n",
" Length_Of_Stay_Days \n",
"0 1 \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ddf1.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "000770c6-0c0e-4a95-88a0-6d018a828941",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>3</th>\n",
" <th>45-64</th>\n",
" <th>2</th>\n",
" <th>2016-06-08 00:00:00</th>\n",
" <th>17-20</th>\n",
" <th>100</th>\n",
" <th>Nothing</th>\n",
" <th>1</th>\n",
" <th>1.1</th>\n",
" <th>1.2</th>\n",
" <th>2.1</th>\n",
" <th>1.3</th>\n",
" <th>15371</th>\n",
" <th>0</th>\n",
" <th>Unnamed: 14</th>\n",
" <th>Unnamed: 15</th>\n",
" <th>Unnamed: 16</th>\n",
" <th>Unnamed: 17</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>927216</th>\n",
" <td>7.0</td>\n",
" <td>1-17</td>\n",
" <td>1.0</td>\n",
" <td>2016-03-01 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>170</td>\n",
" <td>Medium</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3.0</td>\n",
" <td>15264</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927217</th>\n",
" <td>4.0</td>\n",
" <td>18-24</td>\n",
" <td>2.0</td>\n",
" <td>2015-03-07 00:00:00</td>\n",
" <td>01-04</td>\n",
" <td>140</td>\n",
" <td>Low</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>15183</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927218</th>\n",
" <td>7.0</td>\n",
" <td>18-24</td>\n",
" <td>2.0</td>\n",
" <td>2014-05-03 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>100</td>\n",
" <td>Nothing</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>4.0</td>\n",
" <td>15206</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927219</th>\n",
" <td>7.0</td>\n",
" <td>45-64</td>\n",
" <td>1.0</td>\n",
" <td>2016-02-11 00:00:00</td>\n",
" <td>09-12</td>\n",
" <td>80</td>\n",
" <td>Medium</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>15187</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927220</th>\n",
" <td>4.0</td>\n",
" <td>1-17</td>\n",
" <td>2.0</td>\n",
" <td>2017-12-04 00:00:00</td>\n",
" <td>17-20</td>\n",
" <td>130</td>\n",
" <td>Nothing</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>15109</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 3 45-64 2 2016-06-08 00:00:00 17-20 100 Nothing 1 1.1 \\\n",
"927216 7.0 1-17 1.0 2016-03-01 00:00:00 13-16 170 Medium 1 3 \n",
"927217 4.0 18-24 2.0 2015-03-07 00:00:00 01-04 140 Low 1 0 \n",
"927218 7.0 18-24 2.0 2014-05-03 00:00:00 13-16 100 Nothing 1 1 \n",
"927219 7.0 45-64 1.0 2016-02-11 00:00:00 09-12 80 Medium 0 1 \n",
"927220 4.0 1-17 2.0 2017-12-04 00:00:00 17-20 130 Nothing 0 1 \n",
"\n",
" 1.2 2.1 1.3 15371 0 Unnamed: 14 Unnamed: 15 Unnamed: 16 \\\n",
"927216 3 1 3.0 15264 0.0 NaN NaN NaN \n",
"927217 1 2 1.0 15183 0.0 NaN NaN NaN \n",
"927218 1 2 4.0 15206 0.0 NaN NaN NaN \n",
"927219 5 2 2.0 15187 0.0 NaN NaN NaN \n",
"927220 1 2 2.0 15109 0.0 NaN NaN NaN \n",
"\n",
" Unnamed: 17 \n",
"927216 NaN \n",
"927217 NaN \n",
"927218 NaN \n",
"927219 NaN \n",
"927220 NaN "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ddf1.tail()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c4f1f865-a090-49a5-a958-9d8689b9fbe5",
"metadata": {},
"outputs": [],
"source": [
"def generalize_timestamp(timestamp):\n",
"    \"\"\"Truncate a timestamp string to its date part.\n",
"\n",
"    Keeps the first 10 characters, i.e. ``YYYY-MM-DD`` for values shaped like\n",
"    ``2015-07-02 00:00:00``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    timestamp : str, float or None\n",
"        One cell of the timestamp column. Missing cells (``None`` or a float\n",
"        such as NaN) are accepted and returned unchanged.\n",
"\n",
"    Returns\n",
"    -------\n",
"    str, float or None\n",
"        ``timestamp[:10]`` for sliceable input; the input itself when it is\n",
"        ``None`` or a float.\n",
"    \"\"\"\n",
"    # Empty cells in an object-dtype column show up as float NaN (see the NaN\n",
"    # entries in the head() output), and NaN/None cannot be sliced. Pass them\n",
"    # through so one missing value does not abort the whole partition.\n",
"    if timestamp is None or isinstance(timestamp, float):\n",
"        return timestamp\n",
"    return timestamp[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bc1de1ad-3dc3-42c1-a6c1-768bf62cce1b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
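"# Raises KeyError: the last partition of ddf1 was parsed with its first data row as\n",
"# the header (see the ddf1.tail() output above), so it has no 'AE_Arrive_Date' column.\n",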
"ddf1['AE_Arrive_Date'].apply(generalize_timestamp, meta=('AE_Arrive_Date', 'object')).persist()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c3d073d5-eaef-4591-a9c4-97f7b47e706f",
"metadata": {},
"outputs": [],
"source": [
"# Second attempt: header=0 states explicitly that the file's first line holds the\n",
"# column names.\n",
"ddf2 = dd.read_csv(\n",
"    \"data/AESyntheticData.csv\",\n",
"    header=0,\n",
"    dtype=dtype,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b9835438-3686-420f-bc94-333f8ff676ff",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>IMD_Decile_From_LSOA</th>\n",
" <th>Age_Band</th>\n",
" <th>Sex</th>\n",
" <th>AE_Arrive_Date</th>\n",
" <th>AE_Arrive_HourOfDay</th>\n",
" <th>AE_Time_Mins</th>\n",
" <th>AE_HRG</th>\n",
" <th>AE_Num_Diagnoses</th>\n",
" <th>AE_Num_Investigations</th>\n",
" <th>AE_Num_Treatments</th>\n",
" <th>AE_Arrival_Mode</th>\n",
" <th>Provider_Patient_Distance_Miles</th>\n",
" <th>ProvID</th>\n",
" <th>Admitted_Flag</th>\n",
" <th>Admission_Method</th>\n",
" <th>ICD10_Chapter_Code</th>\n",
" <th>Treatment_Function_Code</th>\n",
" <th>Length_Of_Stay_Days</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>65-84</td>\n",
" <td>1</td>\n",
" <td>2015-07-02 00:00:00</td>\n",
" <td>17-20</td>\n",
" <td>210</td>\n",
" <td>High</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>15207</td>\n",
" <td>1</td>\n",
" <td>21</td>\n",
" <td>XVIII</td>\n",
" <td>180</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>18-24</td>\n",
" <td>1</td>\n",
" <td>2017-05-31 00:00:00</td>\n",
" <td>17-20</td>\n",
" <td>20</td>\n",
" <td>Low</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>15321</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>65-84</td>\n",
" <td>2</td>\n",
" <td>2015-10-25 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>280</td>\n",
" <td>Nothing</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>15269</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>25-44</td>\n",
" <td>1</td>\n",
" <td>2014-06-16 00:00:00</td>\n",
" <td>21-24</td>\n",
" <td>150</td>\n",
" <td>Low</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>15239</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>7</td>\n",
" <td>18-24</td>\n",
" <td>2</td>\n",
" <td>2017-11-18 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>180</td>\n",
" <td>Low</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>15286</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IMD_Decile_From_LSOA Age_Band Sex AE_Arrive_Date AE_Arrive_HourOfDay \\\n",
"0 2 65-84 1 2015-07-02 00:00:00 17-20 \n",
"1 6 18-24 1 2017-05-31 00:00:00 17-20 \n",
"2 2 65-84 2 2015-10-25 00:00:00 13-16 \n",
"3 4 25-44 1 2014-06-16 00:00:00 21-24 \n",
"4 7 18-24 2 2017-11-18 00:00:00 13-16 \n",
"\n",
" AE_Time_Mins AE_HRG AE_Num_Diagnoses AE_Num_Investigations \\\n",
"0 210 High 1 5 \n",
"1 20 Low 0 1 \n",
"2 280 Nothing 1 0 \n",
"3 150 Low 1 2 \n",
"4 180 Low 1 1 \n",
"\n",
" AE_Num_Treatments AE_Arrival_Mode Provider_Patient_Distance_Miles ProvID \\\n",
"0 3 2 1 15207 \n",
"1 1 2 5 15321 \n",
"2 1 1 2 15269 \n",
"3 2 2 1 15239 \n",
"4 2 2 1 15286 \n",
"\n",
" Admitted_Flag Admission_Method ICD10_Chapter_Code Treatment_Function_Code \\\n",
"0 1 21 XVIII 180 \n",
"1 0 NaN NaN NaN \n",
"2 0 NaN NaN NaN \n",
"3 0 NaN NaN NaN \n",
"4 0 NaN NaN NaN \n",
"\n",
" Length_Of_Stay_Days \n",
"0 1 \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ddf2.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e2ec5dd4-7332-40b5-82f0-3fa668777777",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>3</th>\n",
" <th>45-64</th>\n",
" <th>2</th>\n",
" <th>2016-06-08 00:00:00</th>\n",
" <th>17-20</th>\n",
" <th>100</th>\n",
" <th>Nothing</th>\n",
" <th>1</th>\n",
" <th>1.1</th>\n",
" <th>1.2</th>\n",
" <th>2.1</th>\n",
" <th>1.3</th>\n",
" <th>15371</th>\n",
" <th>0</th>\n",
" <th>Unnamed: 14</th>\n",
" <th>Unnamed: 15</th>\n",
" <th>Unnamed: 16</th>\n",
" <th>Unnamed: 17</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>927216</th>\n",
" <td>7.0</td>\n",
" <td>1-17</td>\n",
" <td>1.0</td>\n",
" <td>2016-03-01 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>170</td>\n",
" <td>Medium</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3.0</td>\n",
" <td>15264</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927217</th>\n",
" <td>4.0</td>\n",
" <td>18-24</td>\n",
" <td>2.0</td>\n",
" <td>2015-03-07 00:00:00</td>\n",
" <td>01-04</td>\n",
" <td>140</td>\n",
" <td>Low</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>15183</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927218</th>\n",
" <td>7.0</td>\n",
" <td>18-24</td>\n",
" <td>2.0</td>\n",
" <td>2014-05-03 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>100</td>\n",
" <td>Nothing</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>4.0</td>\n",
" <td>15206</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927219</th>\n",
" <td>7.0</td>\n",
" <td>45-64</td>\n",
" <td>1.0</td>\n",
" <td>2016-02-11 00:00:00</td>\n",
" <td>09-12</td>\n",
" <td>80</td>\n",
" <td>Medium</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>15187</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927220</th>\n",
" <td>4.0</td>\n",
" <td>1-17</td>\n",
" <td>2.0</td>\n",
" <td>2017-12-04 00:00:00</td>\n",
" <td>17-20</td>\n",
" <td>130</td>\n",
" <td>Nothing</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>15109</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 3 45-64 2 2016-06-08 00:00:00 17-20 100 Nothing 1 1.1 \\\n",
"927216 7.0 1-17 1.0 2016-03-01 00:00:00 13-16 170 Medium 1 3 \n",
"927217 4.0 18-24 2.0 2015-03-07 00:00:00 01-04 140 Low 1 0 \n",
"927218 7.0 18-24 2.0 2014-05-03 00:00:00 13-16 100 Nothing 1 1 \n",
"927219 7.0 45-64 1.0 2016-02-11 00:00:00 09-12 80 Medium 0 1 \n",
"927220 4.0 1-17 2.0 2017-12-04 00:00:00 17-20 130 Nothing 0 1 \n",
"\n",
" 1.2 2.1 1.3 15371 0 Unnamed: 14 Unnamed: 15 Unnamed: 16 \\\n",
"927216 3 1 3.0 15264 0.0 NaN NaN NaN \n",
"927217 1 2 1.0 15183 0.0 NaN NaN NaN \n",
"927218 1 2 4.0 15206 0.0 NaN NaN NaN \n",
"927219 5 2 2.0 15187 0.0 NaN NaN NaN \n",
"927220 1 2 2.0 15109 0.0 NaN NaN NaN \n",
"\n",
" Unnamed: 17 \n",
"927216 NaN \n",
"927217 NaN \n",
"927218 NaN \n",
"927219 NaN \n",
"927220 NaN "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ddf2.tail()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7d47dbf0-2a8d-49d6-ad58-1659284c62ba",
"metadata": {},
"outputs": [],
"source": [
"# Column names in file order. Derived from the keys of the `dtype` mapping defined\n",
"# near the top of the notebook (dicts preserve insertion order) instead of repeating\n",
"# the same 18 names, so the two definitions cannot drift apart.\n",
"cols = list(dtype)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8ca6336c-b7a6-47ca-b327-bb9a9f54b838",
"metadata": {},
"outputs": [],
"source": [
"# names=cols supplies the column names explicitly rather than reading them from the\n",
"# file. skiprows=1 drops the file's own header line, which would otherwise be\n",
"# ingested as data row 0.\n",
"ddf3 = dd.read_csv(\n",
"    \"data/AESyntheticData.csv\",\n",
"    names=cols,\n",
"    skiprows=1,\n",
"    dtype=dtype,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "9ef10303-7a4e-4011-8aa9-c0cb8b65b9a7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>IMD_Decile_From_LSOA</th>\n",
" <th>Age_Band</th>\n",
" <th>Sex</th>\n",
" <th>AE_Arrive_Date</th>\n",
" <th>AE_Arrive_HourOfDay</th>\n",
" <th>AE_Time_Mins</th>\n",
" <th>AE_HRG</th>\n",
" <th>AE_Num_Diagnoses</th>\n",
" <th>AE_Num_Investigations</th>\n",
" <th>AE_Num_Treatments</th>\n",
" <th>AE_Arrival_Mode</th>\n",
" <th>Provider_Patient_Distance_Miles</th>\n",
" <th>ProvID</th>\n",
" <th>Admitted_Flag</th>\n",
" <th>Admission_Method</th>\n",
" <th>ICD10_Chapter_Code</th>\n",
" <th>Treatment_Function_Code</th>\n",
" <th>Length_Of_Stay_Days</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>IMD_Decile_From_LSOA</td>\n",
" <td>Age_Band</td>\n",
" <td>Sex</td>\n",
" <td>AE_Arrive_Date</td>\n",
" <td>AE_Arrive_HourOfDay</td>\n",
" <td>AE_Time_Mins</td>\n",
" <td>AE_HRG</td>\n",
" <td>AE_Num_Diagnoses</td>\n",
" <td>AE_Num_Investigations</td>\n",
" <td>AE_Num_Treatments</td>\n",
" <td>AE_Arrival_Mode</td>\n",
" <td>Provider_Patient_Distance_Miles</td>\n",
" <td>ProvID</td>\n",
" <td>Admitted_Flag</td>\n",
" <td>Admission_Method</td>\n",
" <td>ICD10_Chapter_Code</td>\n",
" <td>Treatment_Function_Code</td>\n",
" <td>Length_Of_Stay_Days</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>65-84</td>\n",
" <td>1</td>\n",
" <td>2015-07-02 00:00:00</td>\n",
" <td>17-20</td>\n",
" <td>210</td>\n",
" <td>High</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>15207</td>\n",
" <td>1</td>\n",
" <td>21</td>\n",
" <td>XVIII</td>\n",
" <td>180</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>18-24</td>\n",
" <td>1</td>\n",
" <td>2017-05-31 00:00:00</td>\n",
" <td>17-20</td>\n",
" <td>20</td>\n",
" <td>Low</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>15321</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>65-84</td>\n",
" <td>2</td>\n",
" <td>2015-10-25 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>280</td>\n",
" <td>Nothing</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>15269</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>25-44</td>\n",
" <td>1</td>\n",
" <td>2014-06-16 00:00:00</td>\n",
" <td>21-24</td>\n",
" <td>150</td>\n",
" <td>Low</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>15239</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IMD_Decile_From_LSOA Age_Band Sex AE_Arrive_Date \\\n",
"0 IMD_Decile_From_LSOA Age_Band Sex AE_Arrive_Date \n",
"1 2 65-84 1 2015-07-02 00:00:00 \n",
"2 6 18-24 1 2017-05-31 00:00:00 \n",
"3 2 65-84 2 2015-10-25 00:00:00 \n",
"4 4 25-44 1 2014-06-16 00:00:00 \n",
"\n",
" AE_Arrive_HourOfDay AE_Time_Mins AE_HRG AE_Num_Diagnoses \\\n",
"0 AE_Arrive_HourOfDay AE_Time_Mins AE_HRG AE_Num_Diagnoses \n",
"1 17-20 210 High 1 \n",
"2 17-20 20 Low 0 \n",
"3 13-16 280 Nothing 1 \n",
"4 21-24 150 Low 1 \n",
"\n",
" AE_Num_Investigations AE_Num_Treatments AE_Arrival_Mode \\\n",
"0 AE_Num_Investigations AE_Num_Treatments AE_Arrival_Mode \n",
"1 5 3 2 \n",
"2 1 1 2 \n",
"3 0 1 1 \n",
"4 2 2 2 \n",
"\n",
" Provider_Patient_Distance_Miles ProvID Admitted_Flag Admission_Method \\\n",
"0 Provider_Patient_Distance_Miles ProvID Admitted_Flag Admission_Method \n",
"1 1 15207 1 21 \n",
"2 5 15321 0 NaN \n",
"3 2 15269 0 NaN \n",
"4 1 15239 0 NaN \n",
"\n",
" ICD10_Chapter_Code Treatment_Function_Code Length_Of_Stay_Days \n",
"0 ICD10_Chapter_Code Treatment_Function_Code Length_Of_Stay_Days \n",
"1 XVIII 180 1 \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ddf3.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "89850644-abb6-45a8-822b-6d4ad7e1e4d2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>IMD_Decile_From_LSOA</th>\n",
" <th>Age_Band</th>\n",
" <th>Sex</th>\n",
" <th>AE_Arrive_Date</th>\n",
" <th>AE_Arrive_HourOfDay</th>\n",
" <th>AE_Time_Mins</th>\n",
" <th>AE_HRG</th>\n",
" <th>AE_Num_Diagnoses</th>\n",
" <th>AE_Num_Investigations</th>\n",
" <th>AE_Num_Treatments</th>\n",
" <th>AE_Arrival_Mode</th>\n",
" <th>Provider_Patient_Distance_Miles</th>\n",
" <th>ProvID</th>\n",
" <th>Admitted_Flag</th>\n",
" <th>Admission_Method</th>\n",
" <th>ICD10_Chapter_Code</th>\n",
" <th>Treatment_Function_Code</th>\n",
" <th>Length_Of_Stay_Days</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>927217</th>\n",
" <td>7</td>\n",
" <td>1-17</td>\n",
" <td>1</td>\n",
" <td>2016-03-01 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>170</td>\n",
" <td>Medium</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>15264</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927218</th>\n",
" <td>4</td>\n",
" <td>18-24</td>\n",
" <td>2</td>\n",
" <td>2015-03-07 00:00:00</td>\n",
" <td>01-04</td>\n",
" <td>140</td>\n",
" <td>Low</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>15183</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927219</th>\n",
" <td>7</td>\n",
" <td>18-24</td>\n",
" <td>2</td>\n",
" <td>2014-05-03 00:00:00</td>\n",
" <td>13-16</td>\n",
" <td>100</td>\n",
" <td>Nothing</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>15206</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927220</th>\n",
" <td>7</td>\n",
" <td>45-64</td>\n",
" <td>1</td>\n",
" <td>2016-02-11 00:00:00</td>\n",
" <td>09-12</td>\n",
" <td>80</td>\n",
" <td>Medium</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>15187</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>927221</th>\n",
" <td>4</td>\n",
" <td>1-17</td>\n",
" <td>2</td>\n",
" <td>2017-12-04 00:00:00</td>\n",
" <td>17-20</td>\n",
" <td>130</td>\n",
" <td>Nothing</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>15109</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" IMD_Decile_From_LSOA Age_Band Sex AE_Arrive_Date \\\n",
"927217 7 1-17 1 2016-03-01 00:00:00 \n",
"927218 4 18-24 2 2015-03-07 00:00:00 \n",
"927219 7 18-24 2 2014-05-03 00:00:00 \n",
"927220 7 45-64 1 2016-02-11 00:00:00 \n",
"927221 4 1-17 2 2017-12-04 00:00:00 \n",
"\n",
" AE_Arrive_HourOfDay AE_Time_Mins AE_HRG AE_Num_Diagnoses \\\n",
"927217 13-16 170 Medium 1 \n",
"927218 01-04 140 Low 1 \n",
"927219 13-16 100 Nothing 1 \n",
"927220 09-12 80 Medium 0 \n",
"927221 17-20 130 Nothing 0 \n",
"\n",
" AE_Num_Investigations AE_Num_Treatments AE_Arrival_Mode \\\n",
"927217 3 3 1 \n",
"927218 0 1 2 \n",
"927219 1 1 2 \n",
"927220 1 5 2 \n",
"927221 1 1 2 \n",
"\n",
" Provider_Patient_Distance_Miles ProvID Admitted_Flag Admission_Method \\\n",
"927217 3 15264 0 NaN \n",
"927218 1 15183 0 NaN \n",
"927219 4 15206 0 NaN \n",
"927220 2 15187 0 NaN \n",
"927221 2 15109 0 NaN \n",
"\n",
" ICD10_Chapter_Code Treatment_Function_Code Length_Of_Stay_Days \n",
"927217 NaN NaN NaN \n",
"927218 NaN NaN NaN \n",
"927219 NaN NaN NaN \n",
"927220 NaN NaN NaN \n",
"927221 NaN NaN NaN "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ddf3.tail()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "feb7b946-b8b1-4316-b2ab-f371ca394457",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dask Series Structure:\n",
"npartitions=71\n",
" object\n",
" ...\n",
" ... \n",
" ...\n",
" ...\n",
"Name: AE_Arrive_Date, dtype: object\n",
"Dask Name: apply, 71 tasks"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
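"# Works: ddf3 was read with names=cols, so the 'AE_Arrive_Date' column also exists in\n",
"# the last partition (see the ddf3.tail() output above).\n",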
"ddf3['AE_Arrive_Date'].apply(generalize_timestamp, meta=('AE_Arrive_Date', 'object')).persist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94c50f04-265b-4972-bfae-d08fb62cdab6",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment