Forked from wusixer/ufunc_a_python_func.ipynb
Last active
September 29, 2021 01:03
-
-
Save alokito/12ba5bdd7dfae302d75aeeffc8d75a63 to your computer and use it in GitHub Desktop.
Vectorize a python function to speed up computation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "b3807586-479a-4f66-8ecf-524ab66a8f75", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "7f3832dd-d4fb-4930-b13e-2f17f461d6ed", | |
"metadata": {}, | |
"source": [ | |
"#### Today I learned how to pack a Python function into a NumPy ufunc for broadcasting and vectorization to speed up the computation" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "aa72ee17-3dd0-4061-ba74-b816a378c2c7", | |
"metadata": {}, | |
"source": [ | |
"Let's say we have a Python function that works on one input, and we want to apply this function row-wise over the entire dataset." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "fde37878-a0dd-40ca-8873-5f6a0f94d7c9", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def pad_str_get_list(input_str: str, out_len: int) -> list:\n", | |
"    \"\"\"\n", | |
"    Right-pad a string with 'X' up to out_len and split it into characters.\n", | |
"\n", | |
"    :param input_str: the string to pad, e.g. 'ABC'\n", | |
"    :param out_len: the desired length after padding, an integer such as 5\n", | |
"\n", | |
"    :return: a list with one element per character of the padded string,\n", | |
"        e.g. ['A', 'B', 'C', 'X', 'X']. A string that is already at least\n", | |
"        out_len characters long comes back unpadded and untruncated.\n", | |
"    \"\"\"\n", | |
"    # A non-positive count repeats 'X' zero times, so nothing is appended.\n", | |
"    pad_count = out_len - len(input_str)\n", | |
"    padded = input_str + pad_count * 'X'\n", | |
"    return [char for char in padded]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "83e3b785-3544-4ea8-b6d0-2d4971b4db73", | |
"metadata": {}, | |
"source": [ | |
"Verify this function works." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "2b37a4ca-05c8-493d-a696-8953bdf2b28c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['A', 'B', 'C', 'D', 'E', 'X', 'X', 'X', 'X', 'X']" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pad_str_get_list('ABCDE', 10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e5cbd7da-bdd6-4b81-b493-91b403f899d8", | |
"metadata": {}, | |
"source": [ | |
"**First example**: on 1D array" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "12da9526-7844-4dfa-8a95-b600728b9659", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"example_1D = np.array(['apples', 'foobar', 'cowboy', 'banana', 'watermelon', 'pear'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "51ec8557-6ef1-4882-bc5c-0bf2898e0359", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(['apples', 'foobar', 'cowboy', 'banana', 'watermelon', 'pear'],\n", | |
" dtype='<U10')" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"example_1D" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "0957f92a-aa6c-4331-b61c-f26926a47638", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(list,\n", | |
" [['a', 'p', 'p', 'l', 'e', 's', 'X', 'X', 'X', 'X'],\n", | |
" ['f', 'o', 'o', 'b', 'a', 'r', 'X', 'X', 'X', 'X'],\n", | |
" ['c', 'o', 'w', 'b', 'o', 'y', 'X', 'X', 'X', 'X'],\n", | |
" ['b', 'a', 'n', 'a', 'n', 'a', 'X', 'X', 'X', 'X'],\n", | |
" ['w', 'a', 't', 'e', 'r', 'm', 'e', 'l', 'o', 'n'],\n", | |
" ['p', 'e', 'a', 'r', 'X', 'X', 'X', 'X', 'X', 'X']])" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd_apply_out = pd.Series(example_1D).apply(lambda x: pad_str_get_list(x, 10)).tolist()\n", | |
"type(pd_apply_out), pd_apply_out" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "38b637c9-e8b8-46bd-b40f-edba4b78f852", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[['a', 'p', 'p', 'l', 'e', 's', 'X', 'X', 'X', 'X'],\n", | |
" ['f', 'o', 'o', 'b', 'a', 'r', 'X', 'X', 'X', 'X'],\n", | |
" ['c', 'o', 'w', 'b', 'o', 'y', 'X', 'X', 'X', 'X'],\n", | |
" ['b', 'a', 'n', 'a', 'n', 'a', 'X', 'X', 'X', 'X'],\n", | |
" ['w', 'a', 't', 'e', 'r', 'm', 'e', 'l', 'o', 'n'],\n", | |
" ['p', 'e', 'a', 'r', 'X', 'X', 'X', 'X', 'X', 'X']]" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vec_pad_str_get_list = np.frompyfunc(pad_str_get_list, 2,1)\n", | |
"vec_pad_str_get_list(example_1D, 10).tolist()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "ebf4052c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(['apples', 'foobar', 'cowboy', 'banana', 'watermelon', 'pear'],\n", | |
" dtype='<U10')" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"example_1D" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "2b610b38", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([4, 4, 4, 4, 0, 6])" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"length_checker = np.vectorize(len)\n", | |
"10-length_checker(example_1D)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "a69c9c7a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X',\n", | |
" 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'], dtype='<U1')" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"np.repeat(np.repeat('X', len(example_1D)),(10 - length_checker(example_1D)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "b815e07f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def lc_pad_str_get_list(vec, out_len):\n", | |
"    \"\"\"\n", | |
"    Apply pad_str_get_list to each element of vec with a plain Python loop.\n", | |
"\n", | |
"    :param vec: an iterable of strings to pad\n", | |
"    :param out_len: the targeted length passed on to pad_str_get_list\n", | |
"\n", | |
"    :return: a list holding the pad_str_get_list result for every element,\n", | |
"        in iteration order\n", | |
"    \"\"\"\n", | |
"    padded_rows = []\n", | |
"    for item in vec:\n", | |
"        padded_rows.append(pad_str_get_list(item, out_len))\n", | |
"    return padded_rows" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "1ae7f421-620f-429a-8357-c85ca4f0f741", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The pandas approach on 1d array takes ..\n", | |
"371 µs ± 30.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n", | |
"The vectorized approach on 1d array takes ..\n", | |
"7.14 µs ± 406 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n", | |
"A list comprehension takes ..\n", | |
"8.7 µs ± 732 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"print('The pandas approach on 1d array takes ..')\n", | |
"%timeit pd.Series(example_1D).apply(lambda x: pad_str_get_list(x, 10))\n", | |
"print('The vectorized approach on 1d array takes ..')\n", | |
"%timeit vec_pad_str_get_list(example_1D, 10).tolist()\n", | |
"print('A list comprehension takes ..')\n", | |
"%timeit lc_pad_str_get_list(example_1D, 10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "7990e783-8a22-422d-94d5-46a83f8e53d3", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f17a042b-4e3d-477c-9cf7-3d173196564a", | |
"metadata": {}, | |
"source": [ | |
"**Second example**: on 2D array" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "8195abe3-7487-4fa9-b90d-a72d1b442771", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"example_2D = np.array([['apples', 'foobar'], ['cowboy', 'banana'], ['watermelon', 'pear']])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "310f687a-f35d-45aa-8ff6-fb7e367eab01", | |
"metadata": {}, | |
"source": [ | |
"Vectorized result" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "70af5bfe-2f37-49d3-81f1-9e1992e655b1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[list(['a', 'p', 'p', 'l', 'e', 's', 'X', 'X', 'X', 'X']),\n", | |
" list(['f', 'o', 'o', 'b', 'a', 'r', 'X', 'X', 'X', 'X'])],\n", | |
" [list(['c', 'o', 'w', 'b', 'o', 'y', 'X', 'X', 'X', 'X']),\n", | |
" list(['b', 'a', 'n', 'a', 'n', 'a', 'X', 'X', 'X', 'X'])],\n", | |
" [list(['w', 'a', 't', 'e', 'r', 'm', 'e', 'l', 'o', 'n']),\n", | |
" list(['p', 'e', 'a', 'r', 'X', 'X', 'X', 'X', 'X', 'X'])]],\n", | |
" dtype=object)" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vec_pad_str_get_list(example_2D, 10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "8b56040d-c22a-402a-b577-0f501b892481", | |
"metadata": {}, | |
"source": [ | |
"Pandas approach - same as the vectorized result" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "178a550e-6e63-435b-a676-e5336ca49ee9", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[list(['a', 'p', 'p', 'l', 'e', 's', 'X', 'X', 'X', 'X']),\n", | |
" list(['f', 'o', 'o', 'b', 'a', 'r', 'X', 'X', 'X', 'X'])],\n", | |
" [list(['c', 'o', 'w', 'b', 'o', 'y', 'X', 'X', 'X', 'X']),\n", | |
" list(['b', 'a', 'n', 'a', 'n', 'a', 'X', 'X', 'X', 'X'])],\n", | |
" [list(['w', 'a', 't', 'e', 'r', 'm', 'e', 'l', 'o', 'n']),\n", | |
" list(['p', 'e', 'a', 'r', 'X', 'X', 'X', 'X', 'X', 'X'])]],\n", | |
" dtype=object)" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pd.DataFrame(example_2D).applymap(lambda x: pad_str_get_list(x, 10)).to_numpy()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "30142be2-099e-4a09-8922-f610d6cd76fc", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The pandas approach on 2d array takes ..\n", | |
"1.43 ms ± 273 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n", | |
"The vectorized approach on 2d array takes ..\n", | |
"10.2 µs ± 4.36 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"print('The pandas approach on 2d array takes ..')\n", | |
"%timeit pd.DataFrame(example_2D).applymap(lambda x: pad_str_get_list(x, 10)).to_numpy()\n", | |
"\n", | |
"print('The vectorized approach on 2d array takes ..')\n", | |
"%timeit vec_pad_str_get_list(example_2D, 10)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "a995be23-231e-4e97-a668-e3c9c4643a6f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "ufunc", | |
"language": "python", | |
"name": "ufunc" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment