{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.feature_extraction import DictVectorizer\n",
"from sklearn.preprocessing import LabelEncoder"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How does `LabelEncoder` work?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It maps each distinct label to an integer from 0 to n_classes - 1. The integer codes are assigned after the classes are sorted, so the encoding is invariant to the order in which the data are supplied."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Encode labels with value between 0 and n_classes-1.\n",
"\n",
" Attributes\n",
" ----------\n",
" classes_ : array of shape (n_class,)\n",
" Holds the label for each class.\n",
"\n",
" Examples\n",
" --------\n",
" `LabelEncoder` can be used to normalize labels.\n",
"\n",
" >>> from sklearn import preprocessing\n",
" >>> le = preprocessing.LabelEncoder()\n",
" >>> le.fit([1, 2, 2, 6])\n",
" LabelEncoder()\n",
" >>> le.classes_\n",
" array([1, 2, 6])\n",
" >>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS\n",
" array([0, 0, 1, 2]...)\n",
" >>> le.inverse_transform([0, 0, 1, 2])\n",
" array([1, 1, 2, 6])\n",
"\n",
" It can also be used to transform non-numerical labels (as long as they are\n",
" hashable and comparable) to numerical labels.\n",
"\n",
" >>> le = preprocessing.LabelEncoder()\n",
" >>> le.fit([\"paris\", \"paris\", \"tokyo\", \"amsterdam\"])\n",
" LabelEncoder()\n",
" >>> list(le.classes_)\n",
" ['amsterdam', 'paris', 'tokyo']\n",
" >>> le.transform([\"tokyo\", \"tokyo\", \"paris\"]) #doctest: +ELLIPSIS\n",
" array([2, 2, 1]...)\n",
" >>> list(le.inverse_transform([2, 2, 1]))\n",
" ['tokyo', 'tokyo', 'paris']\n",
"\n",
" \n"
]
}
],
"source": [
"print LabelEncoder.__doc__  # Python 2 kernel: print is a statement"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"le = LabelEncoder()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fit on classes that are already in sorted order:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 0, 2], dtype=int64)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"le.fit([1, 2, 2, 6])\n",
"le.transform([1, 2, 1, 6])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fit on the same classes, shuffled; the transform result is identical:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 0, 2], dtype=int64)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"le.fit([6, 2, 1, 2])\n",
"le.transform([1, 2, 1, 6])"
]
},
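{
"cell_type": "markdown",
"metadata": {},
"source": [
"The integer codes can be mapped back to the original labels with `inverse_transform` (a quick sketch reusing the encoder fitted above; this should return `array([1, 2, 1, 6])`). Note that `transform` raises a `ValueError` if it sees a label that was not present during `fit`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"le.inverse_transform([0, 1, 0, 2])"
]
},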
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How does `DictVectorizer` work?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It one-hot encodes string-valued fields and passes numeric fields through unchanged."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Transforms lists of feature-value mappings to vectors.\n",
"\n",
" This transformer turns lists of mappings (dict-like objects) of feature\n",
" names to feature values into Numpy arrays or scipy.sparse matrices for use\n",
" with scikit-learn estimators.\n",
"\n",
" When feature values are strings, this transformer will do a binary one-hot\n",
" (aka one-of-K) coding: one boolean-valued feature is constructed for each\n",
" of the possible string values that the feature can take on. For instance,\n",
" a feature \"f\" that can take on the values \"ham\" and \"spam\" will become two\n",
" features in the output, one signifying \"f=ham\", the other \"f=spam\".\n",
"\n",
" Features that do not occur in a sample (mapping) will have a zero value\n",
" in the resulting array/matrix.\n",
"\n",
" Parameters\n",
" ----------\n",
" dtype : callable, optional\n",
" The type of feature values. Passed to Numpy array/scipy.sparse matrix\n",
" constructors as the dtype argument.\n",
" separator: string, optional\n",
" Separator string used when constructing new features for one-hot\n",
" coding.\n",
" sparse: boolean, optional.\n",
" Whether transform should produce scipy.sparse matrices.\n",
" True by default.\n",
" sort: boolean, optional.\n",
" Whether ``feature_names_`` and ``vocabulary_`` should be sorted when fitting.\n",
" True by default.\n",
"\n",
" Attributes\n",
" ----------\n",
" vocabulary_ : dict\n",
" A dictionary mapping feature names to feature indices.\n",
"\n",
" feature_names_ : list\n",
" A list of length n_features containing the feature names (e.g., \"f=ham\"\n",
" and \"f=spam\").\n",
"\n",
" Examples\n",
" --------\n",
" >>> from sklearn.feature_extraction import DictVectorizer\n",
" >>> v = DictVectorizer(sparse=False)\n",
" >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]\n",
" >>> X = v.fit_transform(D)\n",
" >>> X\n",
" array([[ 2., 0., 1.],\n",
" [ 0., 1., 3.]])\n",
" >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0}, {'baz': 1.0, 'foo': 3.0}]\n",
" True\n",
" >>> v.transform({'foo': 4, 'unseen_feature': 3})\n",
" array([[ 0., 0., 4.]])\n",
"\n",
" See also\n",
" --------\n",
" FeatureHasher : performs vectorization using only a hash function.\n",
" sklearn.preprocessing.OneHotEncoder : handles nominal/categorical features\n",
" encoded as columns of integers.\n",
" \n"
]
}
],
"source": [
"print DictVectorizer.__doc__"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>var1</th>\n",
" <th>var2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>yes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.5</td>\n",
" <td>no</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" <td>maybe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1.2</td>\n",
" <td>definitely</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1.0</td>\n",
" <td>no</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" var1 var2\n",
"0 1.0 yes\n",
"1 2.5 no\n",
"2 3.0 maybe\n",
"3 1.2 definitely\n",
"4 1.0 no"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({'var1': [1, 2.5, 3, 1.2, 1.0], 'var2': ['yes', 'no', 'maybe', 'definitely', 'no']})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>var1</th>\n",
" <td>1</td>\n",
" <td>2.5</td>\n",
" <td>3</td>\n",
" <td>1.2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>var2</th>\n",
" <td>yes</td>\n",
" <td>no</td>\n",
" <td>maybe</td>\n",
" <td>definitely</td>\n",
" <td>no</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4\n",
"var1 1 2.5 3 1.2 1\n",
"var2 yes no maybe definitely no"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.T"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{0: {'var1': 1.0, 'var2': 'yes'},\n",
" 1: {'var1': 2.5, 'var2': 'no'},\n",
" 2: {'var1': 3.0, 'var2': 'maybe'},\n",
" 3: {'var1': 1.2, 'var2': 'definitely'},\n",
" 4: {'var1': 1.0, 'var2': 'no'}}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.T.to_dict()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[{'var1': 1.0, 'var2': 'yes'},\n",
" {'var1': 2.5, 'var2': 'no'},\n",
" {'var1': 3.0, 'var2': 'maybe'},\n",
" {'var1': 1.2, 'var2': 'definitely'},\n",
" {'var1': 1.0, 'var2': 'no'}]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.T.to_dict().values()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"vec = DictVectorizer()\n",
"vectorized = vec.fit_transform(df.T.to_dict().values())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(5, 5)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorized.shape"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 1. 0. 0. 0. 1. ]\n",
" [ 2.5 0. 0. 1. 0. ]\n",
" [ 3. 0. 1. 0. 0. ]\n",
" [ 1.2 1. 0. 0. 0. ]\n",
" [ 1. 0. 0. 1. 0. ]]\n"
]
}
],
"source": [
"print vectorized.toarray()"
]
}
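,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see which column of the array is which, the fitted vectorizer exposes the learned feature names (a quick check: `var1` stays a single numeric column, while `var2` is expanded into one indicator column per observed value, e.g. `var2=no`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"vec.get_feature_names()"
]
}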
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}