TaylorOshan/sparse_categorical.ipynb

## sparse_categorical.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy import sparse as sp\n",
    "from statsmodels.tools.tools import categorical\n",
    "from datetime import datetime as dt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def spcategorical(data):\n",
    "    '''\n",
    "    Returns a dummy matrix given an array of categorical variables.\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : array\n",
    "        A 1d vector of the categorical variable.\n",
    "\n",
    "    Returns\n",
    "    --------\n",
    "    dummy_matrix\n",
    "        A sparse matrix of dummy (indicator/binary) float variables for the\n",
    "        categorical data.  \n",
    "\n",
    "    '''\n",
    "    if np.squeeze(data).ndim == 1:\n",
    "        tmp_arr = np.unique(data)\n",
    "        tmp_dummy = sp.csr_matrix((0, len(data)))\n",
    "        for each in tmp_arr[:, None]:\n",
    "            row = sp.csr_matrix((each == data).astype(float))\n",
    "            tmp_dummy = sp.vstack([tmp_dummy, row])\n",
    "        tmp_dummy = tmp_dummy.T\n",
    "        return tmp_dummy\n",
    "    else:\n",
    "        raise IndexError(\"The index %s is not understood\" % col)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = np.random.randint(1,100, 10000)\n",
    "np.allclose(spcategorical(np.array(data)).toarray(), categorical(np.array(data), drop=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0:10:29.610236\n"
     ]
    }
   ],
   "source": [
    "s = dt.now()\n",
    "n = 3000\n",
    "o = np.tile(np.arange(n),n)\n",
    "o_dums = spcategorical(np.array(o))\n",
    "n = 3000\n",
    "d = np.repeat(np.arange(n),n)\n",
    "d_dums = spcategorical(np.array(d))\n",
    "sp.hstack((o_dums, d_dums))\n",
    "e = dt.now()\n",
    "print e-s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<9000000x6000 sparse matrix of type '<type 'numpy.float64'>'\n",
       "\twith 18000000 stored elements in Compressed Sparse Column format>"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_dums = sp.hstack((o_dums, d_dums))\n",
    "all_dums"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 53,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"from scipy import sparse as sp\n",
	"from statsmodels.tools.tools import categorical\n",
	"from datetime import datetime as dt\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 57,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def spcategorical(data):\n",
	" '''\n",
	" Returns a dummy matrix given an array of categorical variables.\n",
	" Parameters\n",
	" ----------\n",
	" data : array\n",
	" A 1d vector of the categorical variable.\n",
	"\n",
	" Returns\n",
	" --------\n",
	" dummy_matrix\n",
	" A sparse matrix of dummy (indicator/binary) float variables for the\n",
	" categorical data. \n",
	"\n",
	" '''\n",
	" if np.squeeze(data).ndim == 1:\n",
	" tmp_arr = np.unique(data)\n",
	" tmp_dummy = sp.csr_matrix((0, len(data)))\n",
	" for each in tmp_arr[:, None]:\n",
	" row = sp.csr_matrix((each == data).astype(float))\n",
	" tmp_dummy = sp.vstack([tmp_dummy, row])\n",
	" tmp_dummy = tmp_dummy.T\n",
	" return tmp_dummy\n",
	" else:\n",
	" raise IndexError(\"The index %s is not understood\" % col)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 58,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 58,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"data = np.random.randint(1,100, 10000)\n",
	"np.allclose(spcategorical(np.array(data)).toarray(), categorical(np.array(data), drop=True))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0:10:29.610236\n"
	]
	}
	],
	"source": [
	"s = dt.now()\n",
	"n = 3000\n",
	"o = np.tile(np.arange(n),n)\n",
	"o_dums = spcategorical(np.array(o))\n",
	"n = 3000\n",
	"d = np.repeat(np.arange(n),n)\n",
	"d_dums = spcategorical(np.array(d))\n",
	"sp.hstack((o_dums, d_dums))\n",
	"e = dt.now()\n",
	"print e-s"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 59,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"<9000000x6000 sparse matrix of type '<type 'numpy.float64'>'\n",
	"\twith 18000000 stored elements in Compressed Sparse Column format>"
	]
	},
	"execution_count": 59,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"all_dums = sp.hstack((o_dums, d_dums))\n",
	"all_dums"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}