Skip to content

Instantly share code, notes, and snippets.

@davclark
Last active August 12, 2016 17:09
Show Gist options
  • Save davclark/b6ab9ef9ad92cb74bc25c367e0752780 to your computer and use it in GitHub Desktop.
Save davclark/b6ab9ef9ad92cb74bc25c367e0752780 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from objarray import *\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import timeit, random\n",
"\n",
"class TestObject(object):\n",
" \"\"\" A simple class of object for use in benchmark tests.\n",
" \"\"\"\n",
" def __init__(self):\n",
" \"\"\"create an object with random .x,.y and .z attributes\"\"\"\n",
" self.x = random.random()\n",
" self.y = random.random()\n",
" self.z = random.random()\n",
"\n",
" def product(self): return self.x * self.y * self.z\n",
"\n",
"# Since record arrays' and dataframes' analogs of objects can't have methods of their own,\n",
"# the following function serves a purpose equivalent to the product method defined above.\n",
"def manual_product(obj): return obj.x * obj.y * obj.z\n",
"\n",
"# The above manual product function requires that the obj passed to it be some sort of\n",
"# object with magic methods assigning meanings to .x, .y and .z. Pandas apply documentation\n",
"# claims to have faster performance if you instead have it pass mere NumPy arrays as\n",
"# the object-like argument to the applied method. This comes at the inconvenience of\n",
"# not being able to use dot-idiom attribute-reference in your OOP code. The following\n",
"# function is used to see how much speed one can gain at the cost of this inconvenience.\n",
"def manual_product_by_index(obj): return obj[0] * obj[1] * obj[2]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Created objects.\n",
"Created ObjArray.\n",
"Created coupled ObjArray.\n",
"Created buffered method to call.\n",
"Created Record Array.\n",
"Created DataFrame.\n"
]
}
],
"source": [
"SIZE = 1000 # We will generate a SIZE x SIZE array of objects\n",
"\n",
"nested_lists = []\n",
"for i in range(SIZE):\n",
" row = []\n",
" for _ in range(SIZE): row.append(TestObject()) # append SIZE test objects into row\n",
" nested_lists.append(row)\n",
"print(\"Created objects.\")\n",
"\n",
"OA = ObjArray(nested_lists, attributes=('x','y','z') )\n",
"print(\"Created ObjArray.\")\n",
"\n",
"OA.new_coupled_buffer('y') # will leave x uncoupled for testing it\n",
"print(\"Created coupled ObjArray.\")\n",
"\n",
"buffered_product_method = OA.product # a new objarray containing the product methods of OA's objects\n",
"print(\"Created buffered method to call.\")\n",
"\n",
"\n",
"RA = np.recarray((SIZE,SIZE),dtype=[('x', float), ('y', float), ('z',float)] )\n",
"RA.x = OA.x # copy the same random values into this\n",
"RA.y = OA.y\n",
"RA.z = OA.z\n",
"print(\"Created Record Array.\")\n",
"\n",
"\n",
"DF = pd.DataFrame(RA.view(dtype=float).reshape(SIZE*SIZE,3), columns=['x','y','z'] )\n",
"print(\"Created DataFrame.\")\n",
"\n",
"new_vals = np.random.random((SIZE,SIZE)) # a set of new values to use in writing functions"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10000 loops, best of 3: 23.9 µs per loop\n"
]
}
],
"source": [
"%timeit RA.x"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The slowest run took 73.32 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
"100000 loops, best of 3: 5.51 µs per loop\n"
]
}
],
"source": [
"%timeit DF.x"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The slowest run took 12.39 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
"1000000 loops, best of 3: 609 ns per loop\n"
]
}
],
"source": [
"%timeit OA.y"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 1.31 s per loop\n"
]
}
],
"source": [
"%timeit OA.x"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 7.3 s per loop\n"
]
}
],
"source": [
"%timeit OA.product()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Idiomatic approaches to product in a pandas DataFrame are much faster"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 11.3 ms per loop\n"
]
}
],
"source": [
"%timeit DF.x * DF.y * DF.z"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 65 ms per loop\n"
]
}
],
"source": [
"%timeit DF.product(axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Function application in pandas is a bit slower"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 47.9 s per loop\n"
]
}
],
"source": [
"%timeit DF.apply(lambda x: x.x * x.y * x.z, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The slowest run took 8.34 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
"100 loops, best of 3: 3.17 ms per loop\n"
]
}
],
"source": [
"%timeit RA.x = new_vals"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"flat_vals = new_vals.flat"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"100 loops, best of 3: 2.79 ms per loop\n"
]
}
],
"source": [
"%timeit DF.x = flat_vals"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1000 loops, best of 3: 1.2 ms per loop\n"
]
}
],
"source": [
"%timeit OA.y = new_vals"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 loop, best of 3: 6.06 s per loop\n"
]
}
],
"source": [
"%timeit buffered_product_method()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-----------------------\n",
"Reading functions:\n",
" OA_read_coupled: 8.749993867240846e-07\n",
" DF_read: 5.37899904884398e-06\n",
" RA_read: 9.922998287947848e-06\n",
" OA_read_ad_hoc: 1.2511964490004175\n",
" DTO_ndindex_read: 1.0806209040019894\n",
" DTO_for_loops_read: 0.36763668799903826\n",
" NL_for_loops_read: 0.26468918699902133\n",
"-----------------------\n",
"Writing functions:\n",
" OA_write_coupled: 0.0010680109990062192\n",
" DF_write: 0.008858509998390218\n",
" RA_write: 0.0023926989997562487\n",
" OA_write_ad_hoc: 0.83739389600305\n",
" DTO_ndindex_write: 1.1099856850014476\n",
" DTO_for_loops_write: 0.4235311080010433\n",
" NL_for_loops_write: 0.309092707997479\n",
"-----------------------\n",
"Calling functions:\n",
" OA_call_coupled: 5.461909172998276\n",
" DF_call: 52.99924017599915\n",
" RA_call: 22.008603191003203\n",
" OA_call_ad_hoc: 7.199348325000756\n",
" DTO_ndindex_call: 2.2280545459980203\n",
" DTO_for_loops_call: 1.3564103810022061\n",
" NL_for_loops_call: 1.2781733769988932\n",
"-----------------------\n",
"Dataframe Calling functions:\n",
" DF_call_raw_indexed: 1.1983546709998336\n",
"-----------------------\n",
"Command prompt now available.\n"
]
}
],
"source": [
"def RA_read(): return RA.x\n",
"def DF_read(): return DF.x\n",
"def OA_read_coupled(): return OA.y # .y was coupled\n",
"def OA_read_ad_hoc(): return OA.x # .x was not coupled\n",
"def DTO_ndindex_read():\n",
" out = np.empty((SIZE,SIZE), dtype=float)\n",
" for i in np.ndindex(SIZE,SIZE): out[i]=OA[i].x\n",
" return out\n",
"def DTO_for_loops_read():\n",
" out = np.empty((SIZE,SIZE), dtype=float)\n",
" for i in range(SIZE):\n",
" for j in range(SIZE): out[i,j]=OA[i,j].x\n",
" return out\n",
"def NL_for_loops_read():\n",
" out = np.empty((SIZE,SIZE), dtype=float)\n",
" for i, L in enumerate(nested_lists):\n",
" for j in range(SIZE): out[i,j]=L[j].x\n",
" return out\n",
"\n",
"def RA_write(): RA.x = new_vals\n",
"def DF_write(): DF.x = new_vals.flatten() # must flatten because dataframes are limited to 2D\n",
"def OA_write_coupled(): OA.y = new_vals # .y was coupled\n",
"def OA_write_ad_hoc(): OA.x = new_vals # .x was not coupled\n",
"def DTO_ndindex_write():\n",
" for i in np.ndindex(SIZE,SIZE): OA[i].x=new_vals[i]\n",
"def DTO_for_loops_write():\n",
" for i in range(SIZE):\n",
" for j in range(SIZE): OA[i,j].x = new_vals[i,j]\n",
"def NL_for_loops_write():\n",
" for i, L in enumerate(nested_lists):\n",
" for j in range(SIZE): L[j].x=new_vals[i,j]\n",
"\n",
"def RA_call():\n",
" out = np.empty((SIZE,SIZE), dtype=float)\n",
" for i in range(SIZE):\n",
" for j in range(SIZE): out[i,j] = manual_product(RA[i,j])\n",
" return out\n",
"\n",
"def DF_call(): return DF.apply(manual_product, axis=1)\n",
"def DF_call_raw(): return DF.apply(manual_product, axis=1, raw=True)\n",
"# The preceding causes an error because when raw=True, apply passes mere numpy arrays\n",
"# in as the argument, and those do not have .x, .y, and .z magic methods defined,\n",
"# so when manual_product attempts to refer to obj.x this causes an attribute error.\n",
"# The following works instead, at the cost of needing the function to be one\n",
"# that refers to attributes by index rather than by dot-idiom.\n",
"def DF_call_raw_indexed(): return DF.apply(manual_product_by_index, axis=1, raw=True)\n",
"\n",
"def OA_call_coupled(): return buffered_product_method() # an objarray of the product methods of OA's objs\n",
"def OA_call_ad_hoc(): return OA.product()\n",
"def DTO_ndindex_call():\n",
" out = np.empty((SIZE,SIZE), dtype=float)\n",
" for i in np.ndindex(SIZE,SIZE): out[i]=OA[i].product()\n",
" return out\n",
"def DTO_for_loops_call():\n",
" out = np.empty((SIZE,SIZE), dtype=float)\n",
" for i in range(SIZE):\n",
" for j in range(SIZE): out[i,j] = OA[i,j].product()\n",
" return out\n",
"def NL_for_loops_call():\n",
" out = np.empty((SIZE,SIZE), dtype=float)\n",
" for i, L in enumerate(nested_lists):\n",
" for j in range(SIZE): out[i,j]=L[j].product()\n",
" return out\n",
"\n",
"reading_functions = [OA_read_coupled, DF_read, RA_read, OA_read_ad_hoc,\n",
" DTO_ndindex_read, DTO_for_loops_read, NL_for_loops_read]\n",
"writing_functions = [OA_write_coupled, DF_write, RA_write, OA_write_ad_hoc,\n",
" DTO_ndindex_write, DTO_for_loops_write, NL_for_loops_write]\n",
"calling_functions = [OA_call_coupled, DF_call, RA_call, OA_call_ad_hoc,\n",
" DTO_ndindex_call, DTO_for_loops_call, NL_for_loops_call]\n",
"df_calling_functions = [DF_call_raw_indexed]\n",
"\n",
"def print_min_times_for_functions(functions, reps = 10, numb = 1):\n",
" \"\"\" For each listed function, prints min time needed (out of reps tries) to run that\n",
" function numb times in a row.\n",
" :param functions: any iterable structure of things callable by timeit\n",
" :param reps: number of separate trials to run, minimum trial length will be reported\n",
" :param numb: number of times to call a function per trial repetition\n",
" :return: the results are printed, nothing is returned\n",
" \"\"\"\n",
" for f in functions:\n",
" print(' '+(f.__name__)+':', min(timeit.repeat(f, repeat=reps, number=numb)))\n",
"\n",
"\n",
"print(\"-----------------------\")\n",
"print(\"Reading functions:\")\n",
"print_min_times_for_functions(reading_functions)\n",
"\n",
"print(\"-----------------------\")\n",
"print(\"Writing functions:\")\n",
"print_min_times_for_functions(writing_functions)\n",
"\n",
"print(\"-----------------------\")\n",
"print(\"Calling functions:\")\n",
"print_min_times_for_functions(calling_functions)\n",
"\n",
"\n",
"print(\"-----------------------\")\n",
"print(\"Dataframe Calling functions:\")\n",
"print_min_times_for_functions(df_calling_functions)\n",
"\n",
"\n",
"print(\"-----------------------\")\n",
"print(\"Command prompt now available.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment