rbdixon/vectorize_kv.ipynb

## vectorize_kv.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Question: How should I transform multiple key/value columns in a scikit-learn pipeline?\n",
    "\n",
    "See http://stackoverflow.com/questions/31749812/how-should-i-transform-multiple-key-value-columns-in-a-scikit-learn-pipeline/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Input data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  k1  v1 k2  v2\n",
      "0  a   1  b   2\n",
      "1  b   2  c   3\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "D = pd.DataFrame([ ['a', 1, 'b', 2], ['b', 2, 'c', 3]], columns = ['k1', 'v1', 'k2', 'v2'])\n",
    "print(D)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is the type of output data that is required:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'a': 1, 'b': 2}, {'c': 3, 'b': 2}]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[ 1.,  2.,  0.],\n",
       "       [ 0.,  2.,  3.]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction import DictVectorizer\n",
    "\n",
    "row1 = {'a':1, 'b':2}\n",
    "row2 = {'b':2, 'c':3}\n",
    "data = [row1, row2]\n",
    "print(data)\n",
    "\n",
    "DictVectorizer( sparse=False ).fit_transform(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Solution\n",
    "\n",
    "Courtesy of [Mike](http://stackoverflow.com/users/2055368/mike): http://stackoverflow.com/a/31752733/1185562 and extended into a general pipeline transformer.\n",
    "\n",
    "Here is the transformer:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.base import TransformerMixin\n",
    "from sklearn.pipeline import Pipeline, FeatureUnion\n",
    "\n",
    "class KVExtractor(TransformerMixin):\n",
    "    def __init__(self, kvpairs):\n",
    "        self.kpairs = kvpairs\n",
    "        \n",
    "    def transform(self, X, *_):\n",
    "        result = []\n",
    "        for index, rowdata in X.iterrows():\n",
    "            rowdict = {}\n",
    "            for kvp in self.kpairs:\n",
    "                rowdict.update( { rowdata[ kvp[0] ]: rowdata[ kvp[1] ] } )\n",
    "            result.append(rowdict)\n",
    "        return result\n",
    "    \n",
    "    def fit(self, *_):\n",
    "        return self"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Lets try it out:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'a': 1, 'b': 2}, {'b': 2, 'c': 3}]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kvpairs = [ ['k1', 'v1'], ['k2', 'v2'] ]\n",
    "KVExtractor( kvpairs ).transform(D)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now try it out in a pipeline with `DictVectorizer`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  k1  v1 k2  v2\n",
      "0  a   1  b   2\n",
      "1  b   2  c   3\n",
      "(2, 3)\n",
      "[[ 1.  2.  0.]\n",
      " [ 0.  2.  3.]]\n"
     ]
    }
   ],
   "source": [
    "pipeline = Pipeline(\n",
    "    [( 'kv', KVExtractor( kvpairs ) )] +\n",
    "    [( 'dv', DictVectorizer(sparse=False) )] +\n",
    "    []\n",
    ")\n",
    "print(D)\n",
    "A=pipeline.fit_transform(D)\n",
    "print A.shape\n",
    "print A"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Try a new key without transforming:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  k1  v1 k2  v2\n",
      "0  a   1  x   2\n",
      "1  b   2  c   3\n",
      "[[ 1.  0.  0.]\n",
      " [ 0.  2.  3.]]\n"
     ]
    }
   ],
   "source": [
    "D['k2'] = ['x', 'c']\n",
    "print D\n",
    "print pipeline.transform(D)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Perfect!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'KVExtractor' object has no attribute 'inverse_transform'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-17-9e5154ad285e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minverse_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/opt/boxen/homebrew/lib/python2.7/site-packages/sklearn/utils/metaestimators.pyc\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     35\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_attribute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     36\u001b[0m         \u001b[0;31m# lambda, but not partial, allows help() to work with update_wrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m         \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     38\u001b[0m         \u001b[0;31m# update the docstring of the returned function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     39\u001b[0m         \u001b[0mupdate_wrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/boxen/homebrew/lib/python2.7/site-packages/sklearn/pipeline.pyc\u001b[0m in \u001b[0;36minverse_transform\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m    265\u001b[0m         \u001b[0mXt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    266\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstep\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 267\u001b[0;31m             \u001b[0mXt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minverse_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    268\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mXt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    269\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'KVExtractor' object has no attribute 'inverse_transform'"
     ]
    }
   ],
   "source": [
    "pipeline.inverse_transform(A)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Question: How should I transform multiple key/value columns in a scikit-learn pipeline?\n",
	"\n",
	"See http://stackoverflow.com/questions/31749812/how-should-i-transform-multiple-key-value-columns-in-a-scikit-learn-pipeline/"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Input data:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" k1 v1 k2 v2\n",
	"0 a 1 b 2\n",
	"1 b 2 c 3\n"
	]
	}
	],
	"source": [
	"import pandas as pd\n",
	"\n",
	"D = pd.DataFrame([ ['a', 1, 'b', 2], ['b', 2, 'c', 3]], columns = ['k1', 'v1', 'k2', 'v2'])\n",
	"print(D)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"This is the type of output data that is required:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[{'a': 1, 'b': 2}, {'c': 3, 'b': 2}]\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"array([[ 1., 2., 0.],\n",
	" [ 0., 2., 3.]])"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.feature_extraction import DictVectorizer\n",
	"\n",
	"row1 = {'a':1, 'b':2}\n",
	"row2 = {'b':2, 'c':3}\n",
	"data = [row1, row2]\n",
	"print(data)\n",
	"\n",
	"DictVectorizer( sparse=False ).fit_transform(data)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"# Solution\n",
	"\n",
	"Courtesy of [Mike](http://stackoverflow.com/users/2055368/mike): http://stackoverflow.com/a/31752733/1185562 and extended into a general pipeline transformer.\n",
	"\n",
	"Here is the transformer:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from sklearn.base import TransformerMixin\n",
	"from sklearn.pipeline import Pipeline, FeatureUnion\n",
	"\n",
	"class KVExtractor(TransformerMixin):\n",
	" def __init__(self, kvpairs):\n",
	" self.kpairs = kvpairs\n",
	" \n",
	" def transform(self, X, *_):\n",
	" result = []\n",
	" for index, rowdata in X.iterrows():\n",
	" rowdict = {}\n",
	" for kvp in self.kpairs:\n",
	" rowdict.update( { rowdata[ kvp[0] ]: rowdata[ kvp[1] ] } )\n",
	" result.append(rowdict)\n",
	" return result\n",
	" \n",
	" def fit(self, *_):\n",
	" return self"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Lets try it out:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[{'a': 1, 'b': 2}, {'b': 2, 'c': 3}]"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"kvpairs = [ ['k1', 'v1'], ['k2', 'v2'] ]\n",
	"KVExtractor( kvpairs ).transform(D)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Now try it out in a pipeline with `DictVectorizer`:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" k1 v1 k2 v2\n",
	"0 a 1 b 2\n",
	"1 b 2 c 3\n",
	"(2, 3)\n",
	"[[ 1. 2. 0.]\n",
	" [ 0. 2. 3.]]\n"
	]
	}
	],
	"source": [
	"pipeline = Pipeline(\n",
	" [( 'kv', KVExtractor( kvpairs ) )] +\n",
	" [( 'dv', DictVectorizer(sparse=False) )] +\n",
	" []\n",
	")\n",
	"print(D)\n",
	"A=pipeline.fit_transform(D)\n",
	"print A.shape\n",
	"print A"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Try a new key without transforming:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" k1 v1 k2 v2\n",
	"0 a 1 x 2\n",
	"1 b 2 c 3\n",
	"[[ 1. 0. 0.]\n",
	" [ 0. 2. 3.]]\n"
	]
	}
	],
	"source": [
	"D['k2'] = ['x', 'c']\n",
	"print D\n",
	"print pipeline.transform(D)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Perfect!"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"ename": "AttributeError",
	"evalue": "'KVExtractor' object has no attribute 'inverse_transform'",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<ipython-input-17-9e5154ad285e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpipeline\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minverse_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mA\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
	"\u001b[0;32m/opt/boxen/homebrew/lib/python2.7/site-packages/sklearn/utils/metaestimators.pyc\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(args, kwargs)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_attribute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;31m# lambda, but not partial, allows help() to work with update_wrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0;31m# update the docstring of the returned function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0mupdate_wrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m/opt/boxen/homebrew/lib/python2.7/site-packages/sklearn/pipeline.pyc\u001b[0m in \u001b[0;36minverse_transform\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[0mXt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 266\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstep\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 267\u001b[0;31m \u001b[0mXt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstep\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minverse_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 268\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mXt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 269\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mAttributeError\u001b[0m: 'KVExtractor' object has no attribute 'inverse_transform'"
	]
	}
	],
	"source": [
	"pipeline.inverse_transform(A)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}