dotsdl/json-v-pytables.ipynb

## json-v-pytables.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import datreant as dtr\n",
    "import shutil\n",
    "import time\n",
    "import os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have recently built a JSON backend for `TreantFiles`, which on a few quick benchmarks gives comparable performance to the PyTables/HDF5 backend for most common uses. Although the JSON backend slows down as it accumulates more pieces of metadata, it compares favorably in the range of use we expect with Treants, and it avoids limitations of PyTables such as fixed-size strings and lack of unicode support. It also allows flexibility that PyTables does not make easy, such as non-string (numerical) tags and categories."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Making a Group and many member Treants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def create_big_group(directory, backend, n_members=100):\n",
    "    \"\"\"Time the creation of a Group and its member Treants.\n",
    "\n",
    "    Any existing `directory` is removed before the clock starts, so each\n",
    "    call builds the tree from scratch and the removal itself is not timed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    directory : str\n",
    "        Directory that will hold the Group ('forest') and the member\n",
    "        Treants ('shrub_0', 'shrub_1', ...).\n",
    "    backend : str\n",
    "        Passed through as the `backend` keyword of `dtr.Group` and\n",
    "        `dtr.Treant`.\n",
    "    n_members : int, optional\n",
    "        Number of member Treants to create and add; defaults to 100.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Elapsed wall-clock time in seconds, measured with `time.time()`.\n",
    "    \"\"\"\n",
    "    if os.path.exists(directory):\n",
    "        shutil.rmtree(directory)\n",
    "\n",
    "    start = time.time()\n",
    "\n",
    "    g = dtr.Group(os.path.join(directory, 'forest'), backend=backend)\n",
    "    for i in range(n_members):\n",
    "        treant_path = os.path.join(directory, 'shrub_{}'.format(i))\n",
    "        g.members.add(dtr.Treant(treant_path, backend=backend))\n",
    "\n",
    "    return time.time() - start"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    3.000000\n",
       "mean     0.635660\n",
       "std      0.072950\n",
       "min      0.564949\n",
       "25%      0.598160\n",
       "50%      0.631370\n",
       "75%      0.671015\n",
       "max      0.710660\n",
       "dtype: float64"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series([create_big_group('testtreants_json', 'json') for i in range(3)]).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    3.000000\n",
       "mean     2.237660\n",
       "std      0.035931\n",
       "min      2.207623\n",
       "25%      2.217758\n",
       "50%      2.227892\n",
       "75%      2.252679\n",
       "max      2.277466\n",
       "dtype: float64"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series([create_big_group('testtreants_hdf5', 'pytables') for i in range(3)]).describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Adding a few tags to many Treants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def add_tags_to_members(groupfile, tags=('something', 2, 'five')):\n",
    "    \"\"\"Time adding a few tags to every member of a Group.\n",
    "\n",
    "    The timed section covers opening the Group at `groupfile` and adding\n",
    "    `tags` to each of its members. The tags are removed again afterwards,\n",
    "    outside the timed section.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    groupfile : str\n",
    "        Location of the Group, handed directly to `dtr.Group`.\n",
    "    tags : tuple, optional\n",
    "        Tags to add to (and then remove from) every member; defaults to\n",
    "        ('something', 2, 'five').\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        Elapsed wall-clock time in seconds, measured with `time.time()`.\n",
    "    \"\"\"\n",
    "    start = time.time()\n",
    "\n",
    "    g = dtr.Group(groupfile)\n",
    "    for member in g.members:\n",
    "        member.tags.add(*tags)\n",
    "\n",
    "    dt = time.time() - start\n",
    "\n",
    "    # Untimed cleanup: remove exactly the tags that were just added.\n",
    "    for member in g.members:\n",
    "        member.tags.remove(*tags)\n",
    "\n",
    "    return dt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    3.000000\n",
       "mean     0.580425\n",
       "std      0.090424\n",
       "min      0.521217\n",
       "25%      0.528382\n",
       "50%      0.535548\n",
       "75%      0.610028\n",
       "max      0.684509\n",
       "dtype: float64"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series([add_tags_to_members('testtreants_json/forest/') for i in range(3)]).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    3.000000\n",
       "mean     1.623631\n",
       "std      0.033853\n",
       "min      1.585260\n",
       "25%      1.610805\n",
       "50%      1.636350\n",
       "75%      1.642816\n",
       "max      1.649282\n",
       "dtype: float64"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series([add_tags_to_members('testtreants_hdf5/forest/') for i in range(3)]).describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Adding a few categories to many Treants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def add_cats_to_members(groupfile):\n",
    "    \"\"\"Time adding two categories to every member of a Group.\n",
    "\n",
    "    The timed section covers opening the Group at `groupfile` and giving\n",
    "    each member the categories `myself` (its position in the member\n",
    "    iteration) and `my_neighbor` (that position plus one). Both categories\n",
    "    are removed again afterwards, outside the timed section.\n",
    "\n",
    "    Returns the elapsed wall-clock time in seconds from `time.time()`.\n",
    "    \"\"\"\n",
    "    t0 = time.time()\n",
    "\n",
    "    group = dtr.Group(groupfile)\n",
    "    for index, treant in enumerate(group.members):\n",
    "        treant.categories.add(myself=index, my_neighbor=index + 1)\n",
    "\n",
    "    elapsed = time.time() - t0\n",
    "\n",
    "    # Untimed cleanup: drop the two categories that were just added.\n",
    "    for treant in group.members:\n",
    "        treant.categories.remove('myself', 'my_neighbor')\n",
    "\n",
    "    return elapsed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    3.000000\n",
       "mean     0.610465\n",
       "std      0.050423\n",
       "min      0.571738\n",
       "25%      0.581958\n",
       "50%      0.592178\n",
       "75%      0.629829\n",
       "max      0.667480\n",
       "dtype: float64"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series([add_cats_to_members('testtreants_json/forest/') for i in range(3)]).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    3.000000\n",
       "mean     1.632123\n",
       "std      0.060217\n",
       "min      1.579726\n",
       "25%      1.599231\n",
       "50%      1.618736\n",
       "75%      1.658321\n",
       "max      1.697906\n",
       "dtype: float64"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series([add_cats_to_members('testtreants_hdf5/forest/') for i in range(3)]).describe()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python2",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 32,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import datreant as dtr\n",
	"import shutil\n",
	"import time\n",
	"import os\n",
	"import pandas as pd"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We have recently built a JSON backend for `TreantFiles`, which on a few quick benchmarks gives comparable performance to the PyTables/HDF5 backend for most common uses. Although the JSON backend slows down as it accumulates more pieces of metadata, it compares favorably in the range of use we expect with Treants, and it avoids limitations of PyTables such as fixed-size strings and lack of unicode support. It also allows flexibility that PyTables does not make easy, such as non-string (numerical) tags and categories."
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Making a Group and many member Treants"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def create_big_group(directory, backend, n_members=100):\n",
	"    \"\"\"Time the creation of a Group and its member Treants.\n",
	"\n",
	"    Any existing `directory` is removed before the clock starts, so each\n",
	"    call builds the tree from scratch and the removal itself is not timed.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    directory : str\n",
	"        Directory that will hold the Group ('forest') and the member\n",
	"        Treants ('shrub_0', 'shrub_1', ...).\n",
	"    backend : str\n",
	"        Passed through as the `backend` keyword of `dtr.Group` and\n",
	"        `dtr.Treant`.\n",
	"    n_members : int, optional\n",
	"        Number of member Treants to create and add; defaults to 100.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    float\n",
	"        Elapsed wall-clock time in seconds, measured with `time.time()`.\n",
	"    \"\"\"\n",
	"    if os.path.exists(directory):\n",
	"        shutil.rmtree(directory)\n",
	"\n",
	"    start = time.time()\n",
	"\n",
	"    g = dtr.Group(os.path.join(directory, 'forest'), backend=backend)\n",
	"    for i in range(n_members):\n",
	"        treant_path = os.path.join(directory, 'shrub_{}'.format(i))\n",
	"        g.members.add(dtr.Treant(treant_path, backend=backend))\n",
	"\n",
	"    return time.time() - start"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"count 3.000000\n",
	"mean 0.635660\n",
	"std 0.072950\n",
	"min 0.564949\n",
	"25% 0.598160\n",
	"50% 0.631370\n",
	"75% 0.671015\n",
	"max 0.710660\n",
	"dtype: float64"
	]
	},
	"execution_count": 37,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pd.Series([create_big_group('testtreants_json', 'json') for i in range(3)]).describe()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"count 3.000000\n",
	"mean 2.237660\n",
	"std 0.035931\n",
	"min 2.207623\n",
	"25% 2.217758\n",
	"50% 2.227892\n",
	"75% 2.252679\n",
	"max 2.277466\n",
	"dtype: float64"
	]
	},
	"execution_count": 36,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pd.Series([create_big_group('testtreants_hdf5', 'pytables') for i in range(3)]).describe()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Adding a few tags to many Treants"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def add_tags_to_members(groupfile, tags=('something', 2, 'five')):\n",
	"    \"\"\"Time adding a few tags to every member of a Group.\n",
	"\n",
	"    The timed section covers opening the Group at `groupfile` and adding\n",
	"    `tags` to each of its members. The tags are removed again afterwards,\n",
	"    outside the timed section.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    groupfile : str\n",
	"        Location of the Group, handed directly to `dtr.Group`.\n",
	"    tags : tuple, optional\n",
	"        Tags to add to (and then remove from) every member; defaults to\n",
	"        ('something', 2, 'five').\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    float\n",
	"        Elapsed wall-clock time in seconds, measured with `time.time()`.\n",
	"    \"\"\"\n",
	"    start = time.time()\n",
	"\n",
	"    g = dtr.Group(groupfile)\n",
	"    for member in g.members:\n",
	"        member.tags.add(*tags)\n",
	"\n",
	"    dt = time.time() - start\n",
	"\n",
	"    # Untimed cleanup: remove exactly the tags that were just added.\n",
	"    for member in g.members:\n",
	"        member.tags.remove(*tags)\n",
	"\n",
	"    return dt"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 43,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"count 3.000000\n",
	"mean 0.580425\n",
	"std 0.090424\n",
	"min 0.521217\n",
	"25% 0.528382\n",
	"50% 0.535548\n",
	"75% 0.610028\n",
	"max 0.684509\n",
	"dtype: float64"
	]
	},
	"execution_count": 43,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pd.Series([add_tags_to_members('testtreants_json/forest/') for i in range(3)]).describe()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 47,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"count 3.000000\n",
	"mean 1.623631\n",
	"std 0.033853\n",
	"min 1.585260\n",
	"25% 1.610805\n",
	"50% 1.636350\n",
	"75% 1.642816\n",
	"max 1.649282\n",
	"dtype: float64"
	]
	},
	"execution_count": 47,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pd.Series([add_tags_to_members('testtreants_hdf5/forest/') for i in range(3)]).describe()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Adding a few categories to many Treants"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 48,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def add_cats_to_members(groupfile):\n",
	"    \"\"\"Time adding two categories to every member of a Group.\n",
	"\n",
	"    The timed section covers opening the Group at `groupfile` and giving\n",
	"    each member the categories `myself` (its position in the member\n",
	"    iteration) and `my_neighbor` (that position plus one). Both categories\n",
	"    are removed again afterwards, outside the timed section.\n",
	"\n",
	"    Returns the elapsed wall-clock time in seconds from `time.time()`.\n",
	"    \"\"\"\n",
	"    start = time.time()\n",
	"\n",
	"    g = dtr.Group(groupfile)\n",
	"    for i, member in enumerate(g.members):\n",
	"        member.categories.add(myself=i, my_neighbor=i+1)\n",
	"\n",
	"    dt = time.time() - start\n",
	"\n",
	"    # Untimed cleanup: drop the two categories that were just added.\n",
	"    for member in g.members:\n",
	"        member.categories.remove('myself', 'my_neighbor')\n",
	"\n",
	"    return dt"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 49,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"count 3.000000\n",
	"mean 0.610465\n",
	"std 0.050423\n",
	"min 0.571738\n",
	"25% 0.581958\n",
	"50% 0.592178\n",
	"75% 0.629829\n",
	"max 0.667480\n",
	"dtype: float64"
	]
	},
	"execution_count": 49,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pd.Series([add_cats_to_members('testtreants_json/forest/') for i in range(3)]).describe()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 50,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"count 3.000000\n",
	"mean 1.632123\n",
	"std 0.060217\n",
	"min 1.579726\n",
	"25% 1.599231\n",
	"50% 1.618736\n",
	"75% 1.658321\n",
	"max 1.697906\n",
	"dtype: float64"
	]
	},
	"execution_count": 50,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pd.Series([add_cats_to_members('testtreants_hdf5/forest/') for i in range(3)]).describe()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python2",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}