Created
June 4, 2016 05:03
-
-
Save TaylorOshan/45cd01bb08e23280549aee770a05cdfe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"from scipy import sparse as sp\n", | |
"from statsmodels.tools.tools import categorical\n", | |
"from datetime import datetime as dt\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def spcategorical(data):\n", | |
" '''\n", | |
" Returns a dummy matrix given an array of categorical variables.\n", | |
" Parameters\n", | |
" ----------\n", | |
" data : array\n", | |
" A 1d vector of the categorical variable.\n", | |
"\n", | |
" Returns\n", | |
" --------\n", | |
" dummy_matrix\n", | |
" A sparse matrix of dummy (indicator/binary) float variables for the\n", | |
" categorical data. \n", | |
"\n", | |
" '''\n", | |
" if np.squeeze(data).ndim == 1:\n", | |
" tmp_arr = np.unique(data)\n", | |
" tmp_dummy = sp.csr_matrix((0, len(data)))\n", | |
" for each in tmp_arr[:, None]:\n", | |
" row = sp.csr_matrix((each == data).astype(float))\n", | |
" tmp_dummy = sp.vstack([tmp_dummy, row])\n", | |
" tmp_dummy = tmp_dummy.T\n", | |
" return tmp_dummy\n", | |
" else:\n", | |
" raise IndexError(\"The index %s is not understood\" % col)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 58, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 58, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data = np.random.randint(1,100, 10000)\n", | |
"np.allclose(spcategorical(np.array(data)).toarray(), categorical(np.array(data), drop=True))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0:10:29.610236\n" | |
] | |
} | |
], | |
"source": [ | |
"s = dt.now()\n", | |
"n = 3000\n", | |
"o = np.tile(np.arange(n),n)\n", | |
"o_dums = spcategorical(np.array(o))\n", | |
"n = 3000\n", | |
"d = np.repeat(np.arange(n),n)\n", | |
"d_dums = spcategorical(np.array(d))\n", | |
"sp.hstack((o_dums, d_dums))\n", | |
"e = dt.now()\n", | |
"print e-s" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<9000000x6000 sparse matrix of type '<type 'numpy.float64'>'\n", | |
"\twith 18000000 stored elements in Compressed Sparse Column format>" | |
] | |
}, | |
"execution_count": 59, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"all_dums = sp.hstack((o_dums, d_dums))\n", | |
"all_dums" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment