Skip to content

Instantly share code, notes, and snippets.

@cchwala
Created October 22, 2018 13:22
Show Gist options
  • Save cchwala/eb7301470460b115c6acd43e71dafff8 to your computer and use it in GitHub Desktop.
Save cchwala/eb7301470460b115c6acd43e71dafff8 to your computer and use it in GitHub Desktop.
Manually read in one CML from cmlh5 to dask Dataframe
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import h5py\n",
"import dask.dataframe as dd\n",
"import dask.array as da"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><strong>Dask DataFrame Structure:</strong></div>\n",
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rx</th>\n",
" <th>tx</th>\n",
" </tr>\n",
" <tr>\n",
" <th>npartitions=32</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2017-08-07 13:07:18.261913088</th>\n",
" <td>float64</td>\n",
" <td>float64</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2017-08-08 07:10:18.283332096</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2017-08-31 05:05:18.282693888</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2017-08-31 22:35:18.225156096</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
"<div>Dask Name: set_index, 419 tasks</div>"
],
"text/plain": [
"Dask DataFrame Structure:\n",
" rx tx\n",
"npartitions=32 \n",
"2017-08-07 13:07:18.261913088 float64 float64\n",
"2017-08-08 07:10:18.283332096 ... ...\n",
"... ... ...\n",
"2017-08-31 05:05:18.282693888 ... ...\n",
"2017-08-31 22:35:18.225156096 ... ...\n",
"Dask Name: set_index, 419 tasks"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"h5_reader = h5py.File('/pd/data/regclim_data/raw/cml/ericsson_tn_monthly_2017_2018/cmls_2017_08.h5', \n",
" mode='r')\n",
"cml_ids = h5_reader['/'].keys()\n",
"channels = h5_reader['/'][cml_ids[0]].keys()\n",
"\n",
"# Link to data in HDF5 file\n",
"rx = h5_reader['/'][cml_ids[0]][channels[0]]['rx']\n",
"tx = h5_reader['/'][cml_ids[0]][channels[0]]['tx']\n",
"time = h5_reader['/'][cml_ids[0]][channels[0]]['time']\n",
"\n",
"# Concatenate into DaskDataframe\n",
"ddf = dd.from_dask_array(\n",
" da.stack([\n",
" da.from_array(rx, chunks=rx.chunks),\n",
" da.from_array(tx, chunks=tx.chunks),\n",
" da.from_array(time, chunks=time.chunks)], \n",
" axis=1,\n",
" ),\n",
" columns=['rx', 'tx', 'time']\n",
")\n",
"\n",
"# Cast to correct time representation and set time as index\n",
"ddf.time = (ddf.time * 1e9).astype('M8[ns]')\n",
"ddf = ddf.set_index('time', sorted=True)\n",
"ddf"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"ddf['txrx'] = ddf.tx - ddf.rx"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><strong>Dask DataFrame Structure:</strong></div>\n",
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rx</th>\n",
" <th>tx</th>\n",
" <th>txrx</th>\n",
" </tr>\n",
" <tr>\n",
" <th>npartitions=32</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2017-08-07 13:07:18.261913088</th>\n",
" <td>float64</td>\n",
" <td>float64</td>\n",
" <td>float64</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2017-08-08 07:10:18.283332096</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2017-08-31 05:05:18.282693888</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2017-08-31 22:35:18.225156096</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>\n",
"<div>Dask Name: assign, 547 tasks</div>"
],
"text/plain": [
"Dask DataFrame Structure:\n",
" rx tx txrx\n",
"npartitions=32 \n",
"2017-08-07 13:07:18.261913088 float64 float64 float64\n",
"2017-08-08 07:10:18.283332096 ... ... ...\n",
"... ... ... ...\n",
"2017-08-31 05:05:18.282693888 ... ... ...\n",
"2017-08-31 22:35:18.225156096 ... ... ...\n",
"Dask Name: assign, 547 tasks"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ddf"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rx</th>\n",
" <th>tx</th>\n",
" <th>txrx</th>\n",
" </tr>\n",
" <tr>\n",
" <th>time</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2017-08-07 13:07:18.261913088</th>\n",
" <td>-39.8</td>\n",
" <td>16.0</td>\n",
" <td>55.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2017-08-07 13:08:18.294920960</th>\n",
" <td>-40.1</td>\n",
" <td>16.0</td>\n",
" <td>56.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2017-08-07 13:09:18.280340992</th>\n",
" <td>-40.1</td>\n",
" <td>16.0</td>\n",
" <td>56.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2017-08-07 13:10:18.248403968</th>\n",
" <td>-39.8</td>\n",
" <td>16.0</td>\n",
" <td>55.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2017-08-07 13:11:18.248102912</th>\n",
" <td>-39.8</td>\n",
" <td>16.0</td>\n",
" <td>55.8</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" rx tx txrx\n",
"time \n",
"2017-08-07 13:07:18.261913088 -39.8 16.0 55.8\n",
"2017-08-07 13:08:18.294920960 -40.1 16.0 56.1\n",
"2017-08-07 13:09:18.280340992 -40.1 16.0 56.1\n",
"2017-08-07 13:10:18.248403968 -39.8 16.0 55.8\n",
"2017-08-07 13:11:18.248102912 -39.8 16.0 55.8"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ddf.compute().head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"|"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment