/vectorize_kv.ipynb

## vectorize_kv.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Question: How should I transform multiple key/value columns in a scikit-learn pipeline?\n",
    "\n",
    "See http://stackoverflow.com/questions/31749812/how-should-i-transform-multiple-key-value-columns-in-a-scikit-learn-pipeline/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Input data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  k1  v1 k2  v2\n",
      "0  a   1  b   2\n",
      "1  b   2  c   3\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "D = pd.DataFrame([ ['a', 1, 'b', 2], ['b', 2, 'c', 3]], columns = ['k1', 'v1', 'k2', 'v2'])\n",
    "print(D)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is the type of output data that is required:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'a': 1, 'b': 2}, {'c': 3, 'b': 2}]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[ 1.,  2.,  0.],\n",
       "       [ 0.,  2.,  3.]])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction import DictVectorizer\n",
    "\n",
    "row1 = {'a':1, 'b':2}\n",
    "row2 = {'b':2, 'c':3}\n",
    "data = [row1, row2]\n",
    "print(data)\n",
    "\n",
    "DictVectorizer( sparse=False ).fit_transform(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Solution\n",
    "\n",
    "Courtesy of [Mike](http://stackoverflow.com/users/2055368/mike): http://stackoverflow.com/a/31752733/1185562 and extended into a general pipeline transformer.\n",
    "\n",
    "Here is the transformer:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.base import TransformerMixin\n",
    "from sklearn.pipeline import Pipeline, FeatureUnion\n",
    "\n",
    "class KVExtractor(TransformerMixin):\n",
    "    def __init__(self, kvpairs):\n",
    "        self.kpairs = kvpairs\n",
    "        \n",
    "    def transform(self, X, *_):\n",
    "        result = []\n",
    "        for index, rowdata in X.iterrows():\n",
    "            rowdict = {}\n",
    "            for kvp in self.kpairs:\n",
    "                rowdict.update( { rowdata[ kvp[0] ]: rowdata[ kvp[1] ] } )\n",
    "            result.append(rowdict)\n",
    "        return result\n",
    "\n",
    "    def fit(self, *_):\n",
    "        return self"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Lets try it out:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'a': 1, 'b': 2}, {'b': 2, 'c': 3}]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kvpairs = [ ['k1', 'v1'], ['k2', 'v2'] ]\n",
    "KVExtractor( kvpairs ).transform(D)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now try it out in a pipeline with `DictVectorizer`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  k1  v1 k2  v2\n",
      "0  a   1  b   2\n",
      "1  b   2  c   3\n",
      "(2, 3)\n",
      "[[ 1.  2.  0.]\n",
      " [ 0.  2.  3.]]\n"
     ]
    }
   ],
   "source": [
    "pipeline = Pipeline(\n",
    "    [( 'kv', KVExtractor( kvpairs ) )] +\n",
    "    [( 'dv', DictVectorizer(sparse=False) )] +\n",
    "    []\n",
    ")\n",
    "print(D)\n",
    "A=pipeline.fit_transform(D)\n",
    "print A.shape\n",
    "print A"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Perfect!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Question: How should I transform multiple key/value columns in a scikit-learn pipeline?\n",
	"\n",
	"See http://stackoverflow.com/questions/31749812/how-should-i-transform-multiple-key-value-columns-in-a-scikit-learn-pipeline/"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Input data:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" k1 v1 k2 v2\n",
	"0 a 1 b 2\n",
	"1 b 2 c 3\n"
	]
	}
	],
	"source": [
	"import pandas as pd\n",
	"\n",
	"D = pd.DataFrame([ ['a', 1, 'b', 2], ['b', 2, 'c', 3]], columns = ['k1', 'v1', 'k2', 'v2'])\n",
	"print(D)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"This is the type of output data that is required:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[{'a': 1, 'b': 2}, {'c': 3, 'b': 2}]\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"array([[ 1., 2., 0.],\n",
	" [ 0., 2., 3.]])"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.feature_extraction import DictVectorizer\n",
	"\n",
	"row1 = {'a':1, 'b':2}\n",
	"row2 = {'b':2, 'c':3}\n",
	"data = [row1, row2]\n",
	"print(data)\n",
	"\n",
	"DictVectorizer( sparse=False ).fit_transform(data)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"# Solution\n",
	"\n",
	"Courtesy of [Mike](http://stackoverflow.com/users/2055368/mike): http://stackoverflow.com/a/31752733/1185562 and extended into a general pipeline transformer.\n",
	"\n",
	"Here is the transformer:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from sklearn.base import TransformerMixin\n",
	"from sklearn.pipeline import Pipeline, FeatureUnion\n",
	"\n",
	"class KVExtractor(TransformerMixin):\n",
	" def __init__(self, kvpairs):\n",
	" self.kpairs = kvpairs\n",
	" \n",
	" def transform(self, X, *_):\n",
	" result = []\n",
	" for index, rowdata in X.iterrows():\n",
	" rowdict = {}\n",
	" for kvp in self.kpairs:\n",
	" rowdict.update( { rowdata[ kvp[0] ]: rowdata[ kvp[1] ] } )\n",
	" result.append(rowdict)\n",
	" return result\n",
	"\n",
	" def fit(self, *_):\n",
	" return self"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Lets try it out:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[{'a': 1, 'b': 2}, {'b': 2, 'c': 3}]"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"kvpairs = [ ['k1', 'v1'], ['k2', 'v2'] ]\n",
	"KVExtractor( kvpairs ).transform(D)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Now try it out in a pipeline with `DictVectorizer`:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" k1 v1 k2 v2\n",
	"0 a 1 b 2\n",
	"1 b 2 c 3\n",
	"(2, 3)\n",
	"[[ 1. 2. 0.]\n",
	" [ 0. 2. 3.]]\n"
	]
	}
	],
	"source": [
	"pipeline = Pipeline(\n",
	" [( 'kv', KVExtractor( kvpairs ) )] +\n",
	" [( 'dv', DictVectorizer(sparse=False) )] +\n",
	" []\n",
	")\n",
	"print(D)\n",
	"A=pipeline.fit_transform(D)\n",
	"print A.shape\n",
	"print A"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Perfect!"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}