Last active
August 12, 2016 17:09
-
-
Save davclark/b6ab9ef9ad92cb74bc25c367e0752780 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from objarray import *\n", | |
"\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import timeit, random\n", | |
"\n", | |
"class TestObject(object):\n", | |
" \"\"\" A simple class of object for use in benchmark tests.\n", | |
" \"\"\"\n", | |
" def __init__(self):\n", | |
" \"\"\"create an object with random .x,.y and .z attributes\"\"\"\n", | |
" self.x = random.random()\n", | |
" self.y = random.random()\n", | |
" self.z = random.random()\n", | |
"\n", | |
" def product(self): return self.x * self.y * self.z\n", | |
"\n", | |
"# Since record arrays' and dataframes' analogs of objects can't have methods of their own,\n", | |
"# the following function serves an equivalent purpose, to the product method defined above.\n", | |
"def manual_product(obj): return obj.x * obj.y * obj.z\n", | |
"\n", | |
"# The above manual product function requires that the obj passed to it be some sort of\n", | |
"# object with magic methods assinging meanings to .x, .y and .z. Pandas apply documentation\n", | |
"# claims to have faster performance if you instead have it pass mere NumPy arrays as\n", | |
"# the object-like argument to the applied method. This comes at the inconvenience of\n", | |
"# not being able to use dot-idiom attribute-reference in your OOP code. The following\n", | |
"# method is used to see how much speed one can gain at cost of this inconvenience.\n", | |
"def manual_product_by_index(obj): return obj[0] * obj[1] * obj[2]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Created objects.\n", | |
"Created ObjArray.\n", | |
"Created coupled ObjArray.\n", | |
"Created buffered method to call.\n", | |
"Created Record Array.\n", | |
"Created DataFrame.\n" | |
] | |
} | |
], | |
"source": [ | |
"SIZE = 1000 # We will generate a SIZE x SIZE array of objects\n", | |
"\n", | |
"nested_lists = []\n", | |
"for i in range(SIZE):\n", | |
" row = []\n", | |
" for _ in range(SIZE): row.append(TestObject()) # append SIZE test objects into row\n", | |
" nested_lists.append(row)\n", | |
"print(\"Created objects.\")\n", | |
"\n", | |
"OA = ObjArray(nested_lists, attributes=('x','y','z') )\n", | |
"print(\"Created ObjArray.\")\n", | |
"\n", | |
"OA.new_coupled_buffer('y') # will leave x uncoupled for testing it\n", | |
"print(\"Created coupled ObjArray.\")\n", | |
"\n", | |
"buffered_product_method = OA.product # a new objarray containing the product methods of OA's objects\n", | |
"print(\"Created buffered method to call.\")\n", | |
"\n", | |
"\n", | |
"RA = np.recarray((SIZE,SIZE),dtype=[('x', float), ('y', float), ('z',float)] )\n", | |
"RA.x = OA.x # copy the same random values into this\n", | |
"RA.y = OA.y\n", | |
"RA.z = OA.z\n", | |
"print(\"Created Record Array.\")\n", | |
"\n", | |
"\n", | |
"DF = pd.DataFrame(RA.view(dtype=float).reshape(SIZE*SIZE,3), columns=['x','y','z'] )\n", | |
"print(\"Created DataFrame.\")\n", | |
"\n", | |
"new_vals = np.random.random((SIZE,SIZE)) # a set of new values to use in writing functions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10000 loops, best of 3: 23.9 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit RA.x" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The slowest run took 73.32 times longer than the fastest. This could mean that an intermediate result is being cached.\n", | |
"100000 loops, best of 3: 5.51 µs per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit DF.x" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The slowest run took 12.39 times longer than the fastest. This could mean that an intermediate result is being cached.\n", | |
"1000000 loops, best of 3: 609 ns per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit OA.y" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loop, best of 3: 1.31 s per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit OA.x" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loop, best of 3: 7.3 s per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit OA.product()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Idiomatic approaches to product in a pandas DataFrame are much faster" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"100 loops, best of 3: 11.3 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit DF.x * DF.y * DF.z" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 65 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit DF.product(axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Function application in pandas is a bit slower" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loop, best of 3: 47.9 s per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit DF.apply(lambda x: x.x * x.y * x.z, axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The slowest run took 8.34 times longer than the fastest. This could mean that an intermediate result is being cached.\n", | |
"100 loops, best of 3: 3.17 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit RA.x = new_vals" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"flat_vals = new_vals.flat" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"100 loops, best of 3: 2.79 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit DF.x = flat_vals" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1000 loops, best of 3: 1.2 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit OA.y = new_vals" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1 loop, best of 3: 6.06 s per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit buffered_product_methoded_product_method()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"-----------------------\n", | |
"Reading functions:\n", | |
" OA_read_coupled: 8.749993867240846e-07\n", | |
" DF_read: 5.37899904884398e-06\n", | |
" RA_read: 9.922998287947848e-06\n", | |
" OA_read_ad_hoc: 1.2511964490004175\n", | |
" DTO_ndindex_read: 1.0806209040019894\n", | |
" DTO_for_loops_read: 0.36763668799903826\n", | |
" NL_for_loops_read: 0.26468918699902133\n", | |
"-----------------------\n", | |
"Writing functions:\n", | |
" OA_write_coupled: 0.0010680109990062192\n", | |
" DF_write: 0.008858509998390218\n", | |
" RA_write: 0.0023926989997562487\n", | |
" OA_write_ad_hoc: 0.83739389600305\n", | |
" DTO_ndindex_write: 1.1099856850014476\n", | |
" DTO_for_loops_write: 0.4235311080010433\n", | |
" NL_for_loops_write: 0.309092707997479\n", | |
"-----------------------\n", | |
"Calling functions:\n", | |
" OA_call_coupled: 5.461909172998276\n", | |
" DF_call: 52.99924017599915\n", | |
" RA_call: 22.008603191003203\n", | |
" OA_call_ad_hoc: 7.199348325000756\n", | |
" DTO_ndindex_call: 2.2280545459980203\n", | |
" DTO_for_loops_call: 1.3564103810022061\n", | |
" NL_for_loops_call: 1.2781733769988932\n", | |
"-----------------------\n", | |
"Dataframe Calling functions:\n", | |
" DF_call_raw_indexed: 1.1983546709998336\n", | |
"-----------------------\n", | |
"Command prompt now available.\n" | |
] | |
} | |
], | |
"source": [ | |
"def RA_read(): return RA.x\n", | |
"def DF_read(): return DF.x\n", | |
"def OA_read_coupled(): return OA.y # .y was coupled\n", | |
"def OA_read_ad_hoc(): return OA.x # .x was not coupled\n", | |
"def DTO_ndindex_read():\n", | |
" out = np.empty((SIZE,SIZE), dtype=float)\n", | |
" for i in np.ndindex(SIZE,SIZE): out[i]=OA[i].x\n", | |
" return out\n", | |
"def DTO_for_loops_read():\n", | |
" out = np.empty((SIZE,SIZE), dtype=float)\n", | |
" for i in range(SIZE):\n", | |
" for j in range(SIZE): out[i,j]=OA[i,j].x\n", | |
" return out\n", | |
"def NL_for_loops_read():\n", | |
" out = np.empty((SIZE,SIZE), dtype=float)\n", | |
" for i, L in enumerate(nested_lists):\n", | |
" for j in range(SIZE): out[i,j]=L[j].x\n", | |
" return out\n", | |
"\n", | |
"def RA_write(): RA.x = new_vals\n", | |
"def DF_write(): DF.x = new_vals.flatten() # must flatten because dataframes limited to 2D\n", | |
"def OA_write_coupled(): OA.y = new_vals # .y was coupled\n", | |
"def OA_write_ad_hoc(): OA.x = new_vals # .x was not coupled\n", | |
"def DTO_ndindex_write():\n", | |
" for i in np.ndindex(SIZE,SIZE): OA[i].x=new_vals[i]\n", | |
"def DTO_for_loops_write():\n", | |
" for i in range(SIZE):\n", | |
" for j in range(SIZE): OA[i,j].x = new_vals[i,j]\n", | |
"def NL_for_loops_write():\n", | |
" for i, L in enumerate(nested_lists):\n", | |
" for j in range(SIZE): L[j].x=new_vals[i,j]\n", | |
"\n", | |
"def RA_call():\n", | |
" out = np.empty((SIZE,SIZE), dtype=float)\n", | |
" for i in range(SIZE):\n", | |
" for j in range(SIZE): out[i,j] = manual_product(RA[i,j])\n", | |
" return out\n", | |
"\n", | |
"def DF_call(): return DF.apply(manual_product, axis=1)\n", | |
"def DF_call_raw(): return DF.apply(manual_product, axis=1, raw=True)\n", | |
"# The preceding causes an error because when raw=True, apply passes mere numpy arrays\n", | |
"# in as the argument, and those do not have .x, .y, and .z magic methods defined,\n", | |
"# so when manual_product attempts to refer to obj.x this causes an attribute error.\n", | |
"# Instead the following works instead, at the cost of needing the method be one\n", | |
"# that refers to attributes by index rather than by dot-idiom.\n", | |
"def DF_call_raw_indexed(): return DF.apply(manual_product_by_index, axis=1, raw=True)\n", | |
"\n", | |
"def OA_call_coupled(): return buffered_product_method() # an objarray of the product methods of OA's objs\n", | |
"def OA_call_ad_hoc(): return OA.product()\n", | |
"def DTO_ndindex_call():\n", | |
" out = np.empty((SIZE,SIZE), dtype=float)\n", | |
" for i in np.ndindex(SIZE,SIZE): out[i]=OA[i].product()\n", | |
" return out\n", | |
"def DTO_for_loops_call():\n", | |
" out = np.empty((SIZE,SIZE), dtype=float)\n", | |
" for i in range(SIZE):\n", | |
" for j in range(SIZE): out[i,j] = OA[i,j].product()\n", | |
" return out\n", | |
"def NL_for_loops_call():\n", | |
" out = np.empty((SIZE,SIZE), dtype=float)\n", | |
" for i, L in enumerate(nested_lists):\n", | |
" for j in range(SIZE): out[i,j]=L[j].product()\n", | |
" return out\n", | |
"\n", | |
"reading_functions = [OA_read_coupled, DF_read, RA_read, OA_read_ad_hoc,\n", | |
" DTO_ndindex_read, DTO_for_loops_read, NL_for_loops_read]\n", | |
"writing_functions = [OA_write_coupled, DF_write, RA_write, OA_write_ad_hoc,\n", | |
" DTO_ndindex_write, DTO_for_loops_write, NL_for_loops_write]\n", | |
"calling_functions = [OA_call_coupled, DF_call, RA_call, OA_call_ad_hoc,\n", | |
" DTO_ndindex_call, DTO_for_loops_call, NL_for_loops_call]\n", | |
"df_calling_functions = [DF_call_raw_indexed]\n", | |
"\n", | |
"def print_min_times_for_functions(functions, reps = 10, numb = 1):\n", | |
" \"\"\" For each listed function, prints min time needed (out of reps tries) to run that\n", | |
" function numb times in a row.\n", | |
" :param functions: any iterable structure of things callable by timeit\n", | |
" :param reps: number of seperate trials to run, minimum trial length will be reported\n", | |
" :param numb: number of times to call a function per trial repitition\n", | |
" :return: the results are printed, nothing is returned\n", | |
" \"\"\"\n", | |
" for f in functions:\n", | |
" print(' '+(f.__name__)+':', min(timeit.repeat(f, repeat=reps, number=numb)))\n", | |
"\n", | |
"\n", | |
"print(\"-----------------------\")\n", | |
"print(\"Reading functions:\")\n", | |
"print_min_times_for_functions(reading_functions)\n", | |
"\n", | |
"print(\"-----------------------\")\n", | |
"print(\"Writing functions:\")\n", | |
"print_min_times_for_functions(writing_functions)\n", | |
"\n", | |
"print(\"-----------------------\")\n", | |
"print(\"Calling functions:\")\n", | |
"print_min_times_for_functions(calling_functions)\n", | |
"\n", | |
"\n", | |
"print(\"-----------------------\")\n", | |
"print(\"Dataframe Calling functions:\")\n", | |
"print_min_times_for_functions(df_calling_functions)\n", | |
"\n", | |
"\n", | |
"print(\"-----------------------\")\n", | |
"print(\"Command prompt now available.\")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment