bmabey/joblib_numpy_memoziation_bug.ipynb

## joblib_numpy_memoziation_bug.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# joblib numpy pickle memoization bug"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "AssertionError",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-1-becd01458d06>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mwith_reuse\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mwithout_reuse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mjl\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhash\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwith_reuse\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mjl\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhash\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwithout_reuse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mAssertionError\u001b[0m: "
     ]
    }
   ],
   "source": [
    "import joblib as jl\n",
    "import numpy as np\n",
    "\n",
    "num = np.int64(10)\n",
    "\n",
    "with_reuse = {'key0': num, 'key1': num}\n",
    "without_reuse = {'key0': num, 'key1': np.int64(10)}\n",
    "\n",
    "assert with_reuse == without_reuse\n",
    "assert jl.hash(with_reuse) == jl.hash(without_reuse)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Even though `with_reuse` and `without_reuse` have the same values, the hashes are different! This appears to be related to the memoization done by `pickle`, which is what `joblib` uses to compute the hashes of objects. We can inspect the bytes produced by the hasher to verify this."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def jl_pickle_bytes(obj):\n",
    "    hasher = jl.hashing.NumpyHasher(hash_name='md5', coerce_mmap=False)\n",
    "    hasher.hash(obj)\n",
    "    return hasher.stream.getvalue()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickletools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    0: \\x80 PROTO      3\n",
      "    2: }    EMPTY_DICT\n",
      "    3: q    BINPUT     0\n",
      "    5: (    MARK\n",
      "    6: X        BINUNICODE 'key0'\n",
      "   15: c        GLOBAL     'joblib.hashing _MyHash'\n",
      "   39: q        BINPUT     1\n",
      "   41: )        EMPTY_TUPLE\n",
      "   42: \\x81     NEWOBJ\n",
      "   43: q        BINPUT     2\n",
      "   45: }        EMPTY_DICT\n",
      "   46: q        BINPUT     3\n",
      "   48: X        BINUNICODE 'args'\n",
      "   57: X        BINUNICODE 'scalar'\n",
      "   68: X        BINUNICODE 'numpy.core.multiarray'\n",
      "   94: \\x86     TUPLE2\n",
      "   95: q        BINPUT     4\n",
      "   97: s        SETITEM\n",
      "   98: b        BUILD\n",
      "   99: c        GLOBAL     'numpy dtype'\n",
      "  112: q        BINPUT     5\n",
      "  114: X        BINUNICODE 'HASHED'\n",
      "  125: ]        EMPTY_LIST\n",
      "  126: q        BINPUT     6\n",
      "  128: X        BINUNICODE ''\n",
      "  133: X        BINUNICODE '<i8'\n",
      "  141: \\x86     TUPLE2\n",
      "  142: q        BINPUT     7\n",
      "  144: a        APPEND\n",
      "  145: \\x86     TUPLE2\n",
      "  146: q        BINPUT     8\n",
      "  148: \\x86     TUPLE2\n",
      "  149: q        BINPUT     9\n",
      "  151: C        SHORT_BINBYTES b'\\n\\x00\\x00\\x00\\x00\\x00\\x00\\x00'\n",
      "  161: \\x86     TUPLE2\n",
      "  162: q        BINPUT     10\n",
      "  164: R        REDUCE\n",
      "  165: q        BINPUT     11\n",
      "  167: X        BINUNICODE 'key1'\n",
      "  176: h        BINGET     11\n",
      "  178: u        SETITEMS   (MARK at 5)\n",
      "  179: .    STOP\n",
      "highest protocol among opcodes = 3\n"
     ]
    }
   ],
   "source": [
    "pickletools.dis(jl_pickle_bytes(with_reuse))"
   ]
  },
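  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "These lines in particular are where the memoization happens for the `with_reuse` dict: the value stored under `key0` is saved in memo slot 11 (`BINPUT 11`), and the value for `key1` is then written as nothing more than a reference to that slot (`BINGET 11`):\n",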
    "```\n",
    "  162: q        BINPUT     10\n",
    "  164: R        REDUCE\n",
    "  165: q        BINPUT     11\n",
    "  167: X        BINUNICODE 'key1'\n",
    "  176: h        BINGET     11\n",
    "```"
   ]
  },
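  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Comparing that to the bytecode produced for the `without_reuse` dict, you'll see no memoization of the repeated numpy value: the scalar under `key1` is serialized a second time instead of being replaced by a single `BINGET 11` reference. Hence the longer byte stream, when hashed, produces a different digest."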
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    0: \\x80 PROTO      3\n",
      "    2: }    EMPTY_DICT\n",
      "    3: q    BINPUT     0\n",
      "    5: (    MARK\n",
      "    6: X        BINUNICODE 'key0'\n",
      "   15: c        GLOBAL     'joblib.hashing _MyHash'\n",
      "   39: q        BINPUT     1\n",
      "   41: )        EMPTY_TUPLE\n",
      "   42: \\x81     NEWOBJ\n",
      "   43: q        BINPUT     2\n",
      "   45: }        EMPTY_DICT\n",
      "   46: q        BINPUT     3\n",
      "   48: X        BINUNICODE 'args'\n",
      "   57: X        BINUNICODE 'scalar'\n",
      "   68: X        BINUNICODE 'numpy.core.multiarray'\n",
      "   94: \\x86     TUPLE2\n",
      "   95: q        BINPUT     4\n",
      "   97: s        SETITEM\n",
      "   98: b        BUILD\n",
      "   99: c        GLOBAL     'numpy dtype'\n",
      "  112: q        BINPUT     5\n",
      "  114: X        BINUNICODE 'HASHED'\n",
      "  125: ]        EMPTY_LIST\n",
      "  126: q        BINPUT     6\n",
      "  128: X        BINUNICODE ''\n",
      "  133: X        BINUNICODE '<i8'\n",
      "  141: \\x86     TUPLE2\n",
      "  142: q        BINPUT     7\n",
      "  144: a        APPEND\n",
      "  145: \\x86     TUPLE2\n",
      "  146: q        BINPUT     8\n",
      "  148: \\x86     TUPLE2\n",
      "  149: q        BINPUT     9\n",
      "  151: C        SHORT_BINBYTES b'\\n\\x00\\x00\\x00\\x00\\x00\\x00\\x00'\n",
      "  161: \\x86     TUPLE2\n",
      "  162: q        BINPUT     10\n",
      "  164: R        REDUCE\n",
      "  165: q        BINPUT     11\n",
      "  167: X        BINUNICODE 'key1'\n",
      "  176: h        BINGET     1\n",
      "  178: )        EMPTY_TUPLE\n",
      "  179: \\x81     NEWOBJ\n",
      "  180: q        BINPUT     12\n",
      "  182: }        EMPTY_DICT\n",
      "  183: q        BINPUT     13\n",
      "  185: X        BINUNICODE 'args'\n",
      "  194: X        BINUNICODE 'scalar'\n",
      "  205: X        BINUNICODE 'numpy.core.multiarray'\n",
      "  231: \\x86     TUPLE2\n",
      "  232: q        BINPUT     14\n",
      "  234: s        SETITEM\n",
      "  235: b        BUILD\n",
      "  236: h        BINGET     5\n",
      "  238: X        BINUNICODE 'HASHED'\n",
      "  249: ]        EMPTY_LIST\n",
      "  250: q        BINPUT     15\n",
      "  252: X        BINUNICODE ''\n",
      "  257: X        BINUNICODE '<i8'\n",
      "  265: \\x86     TUPLE2\n",
      "  266: q        BINPUT     16\n",
      "  268: a        APPEND\n",
      "  269: \\x86     TUPLE2\n",
      "  270: q        BINPUT     17\n",
      "  272: \\x86     TUPLE2\n",
      "  273: q        BINPUT     18\n",
      "  275: C        SHORT_BINBYTES b'\\n\\x00\\x00\\x00\\x00\\x00\\x00\\x00'\n",
      "  285: \\x86     TUPLE2\n",
      "  286: q        BINPUT     19\n",
      "  288: R        REDUCE\n",
      "  289: q        BINPUT     20\n",
      "  291: u        SETITEMS   (MARK at 5)\n",
      "  292: .    STOP\n",
      "highest protocol among opcodes = 3\n"
     ]
    }
   ],
   "source": [
    "pickletools.dis(jl_pickle_bytes(without_reuse))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:bioid]",
   "language": "python",
   "name": "conda-env-bioid-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# joblib numpy pickle memoization bug"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"ename": "AssertionError",
	"evalue": "",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<ipython-input-1-becd01458d06>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mwith_reuse\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mwithout_reuse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mjl\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhash\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwith_reuse\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mjl\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhash\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwithout_reuse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
	"\u001b[0;31mAssertionError\u001b[0m: "
	]
	}
	],
	"source": [
	"import joblib as jl\n",
	"import numpy as np\n",
	"\n",
	"num = np.int64(10)\n",
	"\n",
	"with_reuse = {'key0': num, 'key1': num}\n",
	"without_reuse = {'key0': num, 'key1': np.int64(10)}\n",
	"\n",
	"assert with_reuse == without_reuse\n",
	"assert jl.hash(with_reuse) == jl.hash(without_reuse)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Even though `with_reuse` and `without_reuse` have the same values, the hashes are different! This appears to be related to the memoization done by `pickle`, which is what `joblib` uses to compute the hashes of objects. We can inspect the bytes produced by the hasher to verify this."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def jl_pickle_bytes(obj):\n",
	" hasher = jl.hashing.NumpyHasher(hash_name='md5', coerce_mmap=False)\n",
	" hasher.hash(obj)\n",
	" return hasher.stream.getvalue()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pickletools"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 0: \\x80 PROTO 3\n",
	" 2: } EMPTY_DICT\n",
	" 3: q BINPUT 0\n",
	" 5: ( MARK\n",
	" 6: X BINUNICODE 'key0'\n",
	" 15: c GLOBAL 'joblib.hashing _MyHash'\n",
	" 39: q BINPUT 1\n",
	" 41: ) EMPTY_TUPLE\n",
	" 42: \\x81 NEWOBJ\n",
	" 43: q BINPUT 2\n",
	" 45: } EMPTY_DICT\n",
	" 46: q BINPUT 3\n",
	" 48: X BINUNICODE 'args'\n",
	" 57: X BINUNICODE 'scalar'\n",
	" 68: X BINUNICODE 'numpy.core.multiarray'\n",
	" 94: \\x86 TUPLE2\n",
	" 95: q BINPUT 4\n",
	" 97: s SETITEM\n",
	" 98: b BUILD\n",
	" 99: c GLOBAL 'numpy dtype'\n",
	" 112: q BINPUT 5\n",
	" 114: X BINUNICODE 'HASHED'\n",
	" 125: ] EMPTY_LIST\n",
	" 126: q BINPUT 6\n",
	" 128: X BINUNICODE ''\n",
	" 133: X BINUNICODE '<i8'\n",
	" 141: \\x86 TUPLE2\n",
	" 142: q BINPUT 7\n",
	" 144: a APPEND\n",
	" 145: \\x86 TUPLE2\n",
	" 146: q BINPUT 8\n",
	" 148: \\x86 TUPLE2\n",
	" 149: q BINPUT 9\n",
	" 151: C SHORT_BINBYTES b'\\n\\x00\\x00\\x00\\x00\\x00\\x00\\x00'\n",
	" 161: \\x86 TUPLE2\n",
	" 162: q BINPUT 10\n",
	" 164: R REDUCE\n",
	" 165: q BINPUT 11\n",
	" 167: X BINUNICODE 'key1'\n",
	" 176: h BINGET 11\n",
	" 178: u SETITEMS (MARK at 5)\n",
	" 179: . STOP\n",
	"highest protocol among opcodes = 3\n"
	]
	}
	],
	"source": [
	"pickletools.dis(jl_pickle_bytes(with_reuse))"
	]
	},
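	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"These lines in particular are where the memoization happens for the `with_reuse` dict: the value stored under `key0` is saved in memo slot 11 (`BINPUT 11`), and the value for `key1` is then written as nothing more than a reference to that slot (`BINGET 11`):\n",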
	"```\n",
	" 162: q BINPUT 10\n",
	" 164: R REDUCE\n",
	" 165: q BINPUT 11\n",
	" 167: X BINUNICODE 'key1'\n",
	" 176: h BINGET 11\n",
	"```"
	]
	},
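	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Comparing that to the bytecode produced for the `without_reuse` dict, you'll see no memoization of the repeated numpy value: the scalar under `key1` is serialized a second time instead of being replaced by a single `BINGET 11` reference. Hence the longer byte stream, when hashed, produces a different digest."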
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 0: \\x80 PROTO 3\n",
	" 2: } EMPTY_DICT\n",
	" 3: q BINPUT 0\n",
	" 5: ( MARK\n",
	" 6: X BINUNICODE 'key0'\n",
	" 15: c GLOBAL 'joblib.hashing _MyHash'\n",
	" 39: q BINPUT 1\n",
	" 41: ) EMPTY_TUPLE\n",
	" 42: \\x81 NEWOBJ\n",
	" 43: q BINPUT 2\n",
	" 45: } EMPTY_DICT\n",
	" 46: q BINPUT 3\n",
	" 48: X BINUNICODE 'args'\n",
	" 57: X BINUNICODE 'scalar'\n",
	" 68: X BINUNICODE 'numpy.core.multiarray'\n",
	" 94: \\x86 TUPLE2\n",
	" 95: q BINPUT 4\n",
	" 97: s SETITEM\n",
	" 98: b BUILD\n",
	" 99: c GLOBAL 'numpy dtype'\n",
	" 112: q BINPUT 5\n",
	" 114: X BINUNICODE 'HASHED'\n",
	" 125: ] EMPTY_LIST\n",
	" 126: q BINPUT 6\n",
	" 128: X BINUNICODE ''\n",
	" 133: X BINUNICODE '<i8'\n",
	" 141: \\x86 TUPLE2\n",
	" 142: q BINPUT 7\n",
	" 144: a APPEND\n",
	" 145: \\x86 TUPLE2\n",
	" 146: q BINPUT 8\n",
	" 148: \\x86 TUPLE2\n",
	" 149: q BINPUT 9\n",
	" 151: C SHORT_BINBYTES b'\\n\\x00\\x00\\x00\\x00\\x00\\x00\\x00'\n",
	" 161: \\x86 TUPLE2\n",
	" 162: q BINPUT 10\n",
	" 164: R REDUCE\n",
	" 165: q BINPUT 11\n",
	" 167: X BINUNICODE 'key1'\n",
	" 176: h BINGET 1\n",
	" 178: ) EMPTY_TUPLE\n",
	" 179: \\x81 NEWOBJ\n",
	" 180: q BINPUT 12\n",
	" 182: } EMPTY_DICT\n",
	" 183: q BINPUT 13\n",
	" 185: X BINUNICODE 'args'\n",
	" 194: X BINUNICODE 'scalar'\n",
	" 205: X BINUNICODE 'numpy.core.multiarray'\n",
	" 231: \\x86 TUPLE2\n",
	" 232: q BINPUT 14\n",
	" 234: s SETITEM\n",
	" 235: b BUILD\n",
	" 236: h BINGET 5\n",
	" 238: X BINUNICODE 'HASHED'\n",
	" 249: ] EMPTY_LIST\n",
	" 250: q BINPUT 15\n",
	" 252: X BINUNICODE ''\n",
	" 257: X BINUNICODE '<i8'\n",
	" 265: \\x86 TUPLE2\n",
	" 266: q BINPUT 16\n",
	" 268: a APPEND\n",
	" 269: \\x86 TUPLE2\n",
	" 270: q BINPUT 17\n",
	" 272: \\x86 TUPLE2\n",
	" 273: q BINPUT 18\n",
	" 275: C SHORT_BINBYTES b'\\n\\x00\\x00\\x00\\x00\\x00\\x00\\x00'\n",
	" 285: \\x86 TUPLE2\n",
	" 286: q BINPUT 19\n",
	" 288: R REDUCE\n",
	" 289: q BINPUT 20\n",
	" 291: u SETITEMS (MARK at 5)\n",
	" 292: . STOP\n",
	"highest protocol among opcodes = 3\n"
	]
	}
	],
	"source": [
	"pickletools.dis(jl_pickle_bytes(without_reuse))"
	]
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python [conda env:bioid]",
	"language": "python",
	"name": "conda-env-bioid-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.0"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}