Created
September 7, 2014 00:55
-
-
Save RodrigoPrior/801c4b624b6b1ac7bb7c to your computer and use it in GitHub Desktop.
Tutorial: Pandas Dataframe to Numpy Array and store in HDF5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:5b983a9dd0e6622c0cfd7e27dd8bd2849f702d86ec3e81e11ff909c670a32ac4" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Tutorial: Pandas Dataframe to Numpy Array and store in HDF5 \n", | |
"\n", | |
"Convert a pandas DataFrame into a NumPy array, store the data in an HDF5 file, and read it back as a NumPy array or a DataFrame. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import h5py" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 108 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"np.random.seed(1234)\n", | |
"df = pd.DataFrame(np.random.randn(6,4),columns=list('ABCD'))\n", | |
"df" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" <th>C</th>\n", | |
" <th>D</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 0.471435</td>\n", | |
" <td>-1.190976</td>\n", | |
" <td> 1.432707</td>\n", | |
" <td>-0.312652</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>-0.720589</td>\n", | |
" <td> 0.887163</td>\n", | |
" <td> 0.859588</td>\n", | |
" <td>-0.636524</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 0.015696</td>\n", | |
" <td>-2.242685</td>\n", | |
" <td> 1.150036</td>\n", | |
" <td> 0.991946</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 0.953324</td>\n", | |
" <td>-2.021255</td>\n", | |
" <td>-0.334077</td>\n", | |
" <td> 0.002118</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 0.405453</td>\n", | |
" <td> 0.289092</td>\n", | |
" <td> 1.321158</td>\n", | |
" <td>-1.546906</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>-0.202646</td>\n", | |
" <td>-0.655969</td>\n", | |
" <td> 0.193421</td>\n", | |
" <td> 0.553439</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>6 rows \u00d7 4 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 109, | |
"text": [ | |
" A B C D\n", | |
"0 0.471435 -1.190976 1.432707 -0.312652\n", | |
"1 -0.720589 0.887163 0.859588 -0.636524\n", | |
"2 0.015696 -2.242685 1.150036 0.991946\n", | |
"3 0.953324 -2.021255 -0.334077 0.002118\n", | |
"4 0.405453 0.289092 1.321158 -1.546906\n", | |
"5 -0.202646 -0.655969 0.193421 0.553439\n", | |
"\n", | |
"[6 rows x 4 columns]" | |
] | |
} | |
], | |
"prompt_number": 109 | |
}, | |
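{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.as_matrix.html#pandas.DataFrame.as_matrix\n", | |
"# NOTE: as_matrix() was deprecated in pandas 0.23 and removed in 1.0; on newer versions use df.to_numpy() (or df.values).\n", | |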
"df.as_matrix()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 110, | |
"text": [ | |
"array([[ 4.71435164e-01, -1.19097569e+00, 1.43270697e+00,\n", | |
" -3.12651896e-01],\n", | |
" [ -7.20588733e-01, 8.87162940e-01, 8.59588414e-01,\n", | |
" -6.36523504e-01],\n", | |
" [ 1.56963721e-02, -2.24268495e+00, 1.15003572e+00,\n", | |
" 9.91946022e-01],\n", | |
" [ 9.53324128e-01, -2.02125482e+00, -3.34077366e-01,\n", | |
" 2.11836468e-03],\n", | |
" [ 4.05453412e-01, 2.89091941e-01, 1.32115819e+00,\n", | |
" -1.54690555e+00],\n", | |
" [ -2.02646325e-01, -6.55969344e-01, 1.93421376e-01,\n", | |
" 5.53438911e-01]])" | |
] | |
} | |
], | |
"prompt_number": 110 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# http://stackoverflow.com/questions/13187778/pandas-dataframe-to-numpy-array-include-index\n", | |
"# http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.to_records.html?highlight=to_record#pandas.DataFrame.to_records\n", | |
"df_to_nparray = df.to_records(index=False)\n", | |
"df_to_nparray" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 111, | |
"text": [ | |
"rec.array([ (0.47143516373249306, -1.1909756947064645, 1.4327069684260973, -0.3126518960917129),\n", | |
" (-0.7205887333650116, 0.8871629403077386, 0.8595884137174165, -0.6365235044173491),\n", | |
" (0.015696372114428918, -2.2426849541854055, 1.150035724719818, 0.9919460223426778),\n", | |
" (0.9533241281124304, -2.0212548201949705, -0.334077365808097, 0.002118364683486495),\n", | |
" (0.405453411570191, 0.2890919409800353, 1.3211581921293856, -1.5469055532292402),\n", | |
" (-0.2026463246291819, -0.6559693441389339, 0.19342137647035826, 0.5534389109567419)], \n", | |
" dtype=[('A', '<f8'), ('B', '<f8'), ('C', '<f8'), ('D', '<f8')])" | |
] | |
} | |
], | |
"prompt_number": 111 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# http://docs.h5py.org/en/latest/high/file.html\n", | |
"# http://blog.tremily.us/posts/HDF5/\n", | |
"# http://www.sam.math.ethz.ch/~raoulb/teaching/PythonTutorial/data_storage.html\n", | |
"\n", | |
"# open the file (it is created if it does not exist yet)\n", | |
"# 'a' -> Read/write if exists, create otherwise (the default only in h5py < 3.0; newer versions default to 'r')\n", | |
"f = h5py.File('tuto_myfile.hdf5','a')\n", | |
"\n", | |
"# create dataset\n", | |
"f['dset'] = df_to_nparray\n", | |
"\n", | |
"# close connection to file\n", | |
"f.close()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 112 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"To inspect the HDF5 file from the command line, install the 'hdf5-tools' package. \n", | |
"\n", | |
"On Ubuntu:\n", | |
"\n", | |
" $ sudo apt-get install hdf5-tools\n", | |
" \n", | |
"Then run:\n", | |
"\n", | |
" $ h5dump tuto_myfile.hdf5\n", | |
"\n", | |
"You should see output like this:\n", | |
"\n", | |
" $ h5dump tuto_myfile.hdf5 \n", | |
" HDF5 \"tuto_myfile.hdf5\" {\n", | |
" GROUP \"/\" {\n", | |
" DATASET \"dset\" {\n", | |
" DATATYPE H5T_COMPOUND {\n", | |
" H5T_IEEE_F64LE \"A\";\n", | |
" H5T_IEEE_F64LE \"B\";\n", | |
" H5T_IEEE_F64LE \"C\";\n", | |
" H5T_IEEE_F64LE \"D\";\n", | |
" }\n", | |
" DATASPACE SIMPLE { ( 6 ) / ( 6 ) }\n", | |
" DATA {\n", | |
" (0): {\n", | |
" 0.471435,\n", | |
" -1.19098,\n", | |
" 1.43271,\n", | |
" -0.312652\n", | |
" },\n", | |
" (1): {\n", | |
" -0.720589,\n", | |
" 0.887163,\n", | |
" 0.859588,\n", | |
" -0.636524\n", | |
" },\n", | |
" (2): {\n", | |
" 0.0156964,\n", | |
" -2.24268,\n", | |
" 1.15004,\n", | |
" 0.991946\n", | |
" },\n", | |
" (3): {\n", | |
" 0.953324,\n", | |
" -2.02125,\n", | |
" -0.334077,\n", | |
" 0.00211836\n", | |
" },\n", | |
" (4): {\n", | |
" 0.405453,\n", | |
" 0.289092,\n", | |
" 1.32116,\n", | |
" -1.54691\n", | |
" },\n", | |
" (5): {\n", | |
" -0.202646,\n", | |
" -0.655969,\n", | |
" 0.193421,\n", | |
" 0.553439\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" }\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# read from hdf5\n", | |
"\n", | |
"# open file\n", | |
"# 'r' -> Readonly, file must exist\n", | |
"f = h5py.File('tuto_myfile.hdf5', 'r')\n", | |
"\n", | |
"# load dataset: dset\n", | |
"dset = f['dset']\n", | |
"dset" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 113, | |
"text": [ | |
"<HDF5 dataset \"dset\": shape (6,), type \"|V32\">" | |
] | |
} | |
], | |
"prompt_number": 113 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"a = dset[...]\n", | |
"f.close()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 114 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"a" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 115, | |
"text": [ | |
"array([ (0.47143516373249306, -1.1909756947064645, 1.4327069684260973, -0.3126518960917129),\n", | |
" (-0.7205887333650116, 0.8871629403077386, 0.8595884137174165, -0.6365235044173491),\n", | |
" (0.015696372114428918, -2.2426849541854055, 1.150035724719818, 0.9919460223426778),\n", | |
" (0.9533241281124304, -2.0212548201949705, -0.334077365808097, 0.002118364683486495),\n", | |
" (0.405453411570191, 0.2890919409800353, 1.3211581921293856, -1.5469055532292402),\n", | |
" (-0.2026463246291819, -0.6559693441389339, 0.19342137647035826, 0.5534389109567419)], \n", | |
" dtype=[('A', '<f8'), ('B', '<f8'), ('C', '<f8'), ('D', '<f8')])" | |
] | |
} | |
], | |
"prompt_number": 115 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# http://pandas.pydata.org/pandas-docs/dev/io.html#hdf5-pytables\n", | |
"\n", | |
"# Reading hdf5 in pandas\n", | |
"df2 = pd.read_hdf('tuto_myfile.hdf5', 'dset')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 116 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"df2" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>A</th>\n", | |
" <th>B</th>\n", | |
" <th>C</th>\n", | |
" <th>D</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 0.471435</td>\n", | |
" <td>-1.190976</td>\n", | |
" <td> 1.432707</td>\n", | |
" <td>-0.312652</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>-0.720589</td>\n", | |
" <td> 0.887163</td>\n", | |
" <td> 0.859588</td>\n", | |
" <td>-0.636524</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 0.015696</td>\n", | |
" <td>-2.242685</td>\n", | |
" <td> 1.150036</td>\n", | |
" <td> 0.991946</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 0.953324</td>\n", | |
" <td>-2.021255</td>\n", | |
" <td>-0.334077</td>\n", | |
" <td> 0.002118</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 0.405453</td>\n", | |
" <td> 0.289092</td>\n", | |
" <td> 1.321158</td>\n", | |
" <td>-1.546906</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>-0.202646</td>\n", | |
" <td>-0.655969</td>\n", | |
" <td> 0.193421</td>\n", | |
" <td> 0.553439</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>6 rows \u00d7 4 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 117, | |
"text": [ | |
" A B C D\n", | |
"0 0.471435 -1.190976 1.432707 -0.312652\n", | |
"1 -0.720589 0.887163 0.859588 -0.636524\n", | |
"2 0.015696 -2.242685 1.150036 0.991946\n", | |
"3 0.953324 -2.021255 -0.334077 0.002118\n", | |
"4 0.405453 0.289092 1.321158 -1.546906\n", | |
"5 -0.202646 -0.655969 0.193421 0.553439\n", | |
"\n", | |
"[6 rows x 4 columns]" | |
] | |
} | |
], | |
"prompt_number": 117 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# clean up: delete the HDF5 file created above (comment this line out to keep the file)\n", | |
"! rm -f tuto_myfile.hdf5" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 118 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"P.S.: I know, I know... pandas can store DataFrames directly in HDF5: http://pandas.pydata.org/pandas-docs/dev/io.html#io-hdf5 \n", | |
" \n", | |
";)" | |
] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment