Skip to content

Instantly share code, notes, and snippets.

@jcrist
Created August 16, 2019 15:39
Show Gist options
  • Save jcrist/d9f6e6ec35a655348459ccbb26811350 to your computer and use it in GitHub Desktop.
Save jcrist/d9f6e6ec35a655348459ccbb26811350 to your computer and use it in GitHub Desktop.
Benchmarking matmul implementations on small matrices
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Matrix Multiplications"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"a = np.random.normal(size=(3, 3))\n",
"b = np.random.normal(size=(3, 3))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.61785409, -0.43972078, 0.52660514],\n",
" [ 0.21376266, 0.17617051, 2.8461124 ],\n",
" [ 0.19882498, -2.43285968, 3.2805902 ]])"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.dot(b)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmarking"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from timeit import default_timer\n",
"\n",
"def make_bench(method):\n",
"    \"\"\"Build a timer for a kernel with the signature ``method(a, b, out)``.\n",
"\n",
"    The returned ``bench(a, b, out, n=1000000)`` invokes ``method`` ``n`` times\n",
"    from a nopython-compiled loop and returns the elapsed ``default_timer``\n",
"    time divided by ``n``.\n",
"    \"\"\"\n",
"\n",
"    def bench(a, b, out, n=1000000):\n",
"        # Warm-up run with a small count so the first call for these argument\n",
"        # types happens before the timed region.\n",
"        repeat(a, b, out, 100)\n",
"\n",
"        t0 = default_timer()\n",
"        repeat(a, b, out, n)\n",
"        elapsed = default_timer() - t0\n",
"        return elapsed / n\n",
"\n",
"    @nb.jit(nopython=True, fastmath=True)\n",
"    def repeat(a, b, out, n):\n",
"        # The repetition loop itself is jitted in nopython mode.\n",
"        for _ in range(n):\n",
"            method(a, b, out)\n",
"\n",
"    return bench"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import numba as nb"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"@nb.jit(nopython=True, fastmath=True)\n",
"def blas_matmul(a, b, out):\n",
" return np.dot(a, b, out)\n",
"\n",
"bench_blas = make_bench(blas_matmul)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"236.9579539999993"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"out = np.zeros((3, 3))\n",
"\n",
"bench_blas(a, b, out) * 1e9"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Naive implementation"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import numba as nb\n",
"\n",
"def naive_base(a, b, out):\n",
" m = a.shape[1]\n",
" assert b.shape[0] == m\n",
" \n",
" for i in range(a.shape[0]):\n",
" for j in range(b.shape[1]):\n",
" t = 0\n",
" for k in range(m):\n",
" t += a[i, k] * b[k, j]\n",
" out[i, j] = t\n",
" \n",
"naive = nb.jit(nopython=True)(naive_base)\n",
"naive_fastmath = nb.jit(nopython=True, fastmath=True)(naive_base)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.61785409, -0.43972078, 0.52660514],\n",
" [ 0.21376266, 0.17617051, 2.8461124 ],\n",
" [ 0.19882498, -2.43285968, 3.2805902 ]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"out = np.zeros((3, 3))\n",
"\n",
"naive(a, b, out)\n",
"\n",
"out"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"bench_naive = make_bench(naive)\n",
"bench_naive_fastmath = make_bench(naive_fastmath)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"27.11377199999987"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bench_naive(a, b, out) * 1e9"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Specialized Implementation"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"\n",
"\n",
"def get_filename(_iter=itertools.count()):\n",
" return '<numba-generated-%d>' % next(_iter)\n",
"\n",
"\n",
"def generate_matmul_code(a_shape, b_shape):\n",
" n, m = a_shape\n",
" assert b_shape[0] == m\n",
" p = b_shape[1]\n",
"\n",
" lines = [f\"def matmul_{n}x{m}_{m}x{p}(a, b, out):\"]\n",
" for i in range(n):\n",
" for j in range(p):\n",
" expr = \" + \".join(f\"a[{i},{k}] * b[{k},{j}]\" for k in range(m))\n",
" lines.append(f\" out[{i},{j}] = {expr}\")\n",
" return \"\\n\".join(lines)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"def matmul_3x3_3x3(a, b, out):\n",
" out[0,0] = a[0,0] * b[0,0] + a[0,1] * b[1,0] + a[0,2] * b[2,0]\n",
" out[0,1] = a[0,0] * b[0,1] + a[0,1] * b[1,1] + a[0,2] * b[2,1]\n",
" out[0,2] = a[0,0] * b[0,2] + a[0,1] * b[1,2] + a[0,2] * b[2,2]\n",
" out[1,0] = a[1,0] * b[0,0] + a[1,1] * b[1,0] + a[1,2] * b[2,0]\n",
" out[1,1] = a[1,0] * b[0,1] + a[1,1] * b[1,1] + a[1,2] * b[2,1]\n",
" out[1,2] = a[1,0] * b[0,2] + a[1,1] * b[1,2] + a[1,2] * b[2,2]\n",
" out[2,0] = a[2,0] * b[0,0] + a[2,1] * b[1,0] + a[2,2] * b[2,0]\n",
" out[2,1] = a[2,0] * b[0,1] + a[2,1] * b[1,1] + a[2,2] * b[2,1]\n",
" out[2,2] = a[2,0] * b[0,2] + a[2,1] * b[1,2] + a[2,2] * b[2,2]\n"
]
}
],
"source": [
"print(generate_matmul_code((3, 3), (3, 3)))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def generate_matmul(a_shape, b_shape):\n",
"    \"\"\"Compile the unrolled matmul for the given shapes.\n",
"\n",
"    Returns a pair ``(plain, fastmath)``: the same generated function jitted in\n",
"    nopython mode without and with ``fastmath=True``.\n",
"    \"\"\"\n",
"    source = generate_matmul_code(a_shape, b_shape)\n",
"    code_obj = compile(source, get_filename(), 'exec')\n",
"\n",
"    # Run the module code with separate globals and locals so the generated\n",
"    # def is the first entry of the locals mapping.\n",
"    defined = {}\n",
"    exec(code_obj, {}, defined)\n",
"    func = list(defined.values())[0]\n",
"\n",
"    plain = nb.jit(nopython=True)(func)\n",
"    fast = nb.jit(nopython=True, fastmath=True)(func)\n",
"    return plain, fast"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"specialized, specialized_fastmath = generate_matmul((3, 3), (3, 3))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-0.61785409, -0.43972078, 0.52660514],\n",
" [ 0.21376266, 0.17617051, 2.8461124 ],\n",
" [ 0.19882498, -2.43285968, 3.2805902 ]])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"out = np.zeros((3, 3))\n",
"\n",
"specialized(a, b, out)\n",
"\n",
"out"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"bench_specialized = make_bench(specialized)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"18.81342850000003"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bench_specialized(a, b, out, 10000000) * 1e9"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Comparison across size"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"def make_args(n):\n",
" a = np.random.normal(size=(n, n)).astype('f4')\n",
" b = np.random.normal(size=(n, n)).astype('f4')\n",
" out = np.empty((n, n), dtype='f4')\n",
" return a, b, out\n",
"\n",
"\n",
"def bench(start, end):\n",
"    \"\"\"Time every matmul variant for each square size ``n`` in ``[start, end]``.\n",
"\n",
"    Returns a DataFrame indexed by ``n`` with one column per implementation,\n",
"    holding the value returned by that implementation's timer.\n",
"    \"\"\"\n",
"    # (largest n, repetitions): fewer repetitions as the matrices grow.\n",
"    round_schedule = ((3, 10_000_000), (5, 1_000_000), (8, 500_000))\n",
"    column_names = [\"n\", \"OpenBlas\", \"Naive\", \"Naive Fastmath\", \"Specialized\", \"Specialized Fastmath\"]\n",
"\n",
"    records = []\n",
"    for n in range(start, end + 1):\n",
"        print(\"n = %d\" % n)\n",
"        a, b, out = make_args(n)\n",
"\n",
"        # The unrolled kernels are shape-specific, so they are rebuilt per size.\n",
"        specialized, specialized_fastmath = generate_matmul((n, n), (n, n))\n",
"        timers = (\n",
"            bench_blas,\n",
"            bench_naive,\n",
"            bench_naive_fastmath,\n",
"            make_bench(specialized),\n",
"            make_bench(specialized_fastmath),\n",
"        )\n",
"\n",
"        rounds = 50_000\n",
"        for limit, count in round_schedule:\n",
"            if n <= limit:\n",
"                rounds = count\n",
"                break\n",
"\n",
"        timings = tuple(timer(a, b, out, rounds) for timer in timers)\n",
"        records.append((n,) + timings)\n",
"\n",
"    return pd.DataFrame.from_records(records, columns=column_names).set_index(\"n\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"n = 2\n",
"n = 3\n",
"n = 4\n",
"n = 5\n",
"n = 6\n",
"n = 7\n",
"n = 8\n",
"n = 9\n",
"n = 10\n",
"n = 11\n",
"n = 12\n"
]
}
],
"source": [
"df = bench(2, 12)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x12c4e4518>"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"df.plot.line(title=\"Matmul timings\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x12e2ac080>"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"df['theory'] = 1e-9 * df.index ** 3\n",
"\n",
"df.plot.line(title=\"Matmul timings\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"System info:\n",
"--------------------------------------------------------------------------------\n",
"__Time Stamp__\n",
"2019-08-16 15:38:11.854266\n",
"\n",
"__Hardware Information__\n",
"Machine : x86_64\n",
"CPU Name : haswell\n",
"CPU count : 8\n",
"CPU Features : \n",
"64bit aes avx avx2 bmi bmi2 cmov cx16 f16c fma fsgsbase invpcid lzcnt mmx movbe\n",
"pclmul popcnt rdrnd rtm sahf sse sse2 sse3 sse4.1 sse4.2 ssse3 xsave xsaveopt\n",
"\n",
"__OS Information__\n",
"Platform : Darwin-17.2.0-x86_64-i386-64bit\n",
"Release : 17.2.0\n",
"System Name : Darwin\n",
"Version : Darwin Kernel Version 17.2.0: Fri Sep 29 18:27:05 PDT 2017; root:xnu-4570.20.62~3/RELEASE_X86_64\n",
"OS specific info : 10.13.1 x86_64\n",
"\n",
"__Python Information__\n",
"Python Compiler : Clang 4.0.1 (tags/RELEASE_401/final)\n",
"Python Implementation : CPython\n",
"Python Version : 3.7.3\n",
"Python Locale : en_US UTF-8\n",
"\n",
"__LLVM information__\n",
"LLVM version : 8.0.0\n",
"\n",
"__CUDA Information__\n",
"Error: CUDA device intialisation problem. Message:Error at driver init: \n",
"[3] Call to cuInit results in CUDA_ERROR_NOT_INITIALIZED:\n",
"Error class: <class 'numba.cuda.cudadrv.error.CudaSupportError'>\n",
"\n",
"__ROC Information__\n",
"ROC available : False\n",
"Error initialising ROC due to : No ROC toolchains found.\n",
"No HSA Agents found, encountered exception when searching:\n",
"Error at driver init: \n",
"\n",
"HSA is not currently supported on this platform (darwin).\n",
":\n",
"\n",
"__SVML Information__\n",
"SVML state, config.USING_SVML : False\n",
"SVML library found and loaded : False\n",
"llvmlite using SVML patched LLVM : True\n",
"SVML operational : False\n",
"\n",
"__Threading Layer Information__\n",
"TBB Threading layer available : False\n",
"+--> Disabled due to : Unknown import problem.\n",
"OpenMP Threading layer available : False\n",
"+--> Disabled due to : Unknown import problem.\n",
"Workqueue Threading layer available : True\n",
"\n",
"__Numba Environment Variable Information__\n",
"None set.\n",
"\n",
"__Conda Information__\n",
"conda_build_version : not installed\n",
"conda_env_version : 4.6.14\n",
"platform : osx-64\n",
"python_version : 3.7.3.final.0\n",
"root_writable : True\n",
"\n",
"__Current Conda Env__\n",
"appnope 0.1.0 py37_0 \n",
"attrs 19.1.0 py37_1 \n",
"backcall 0.1.0 py37_0 \n",
"bleach 3.1.0 py37_0 \n",
"blis 0.2.4 pypi_0 pypi\n",
"bokeh 1.3.4 pypi_0 pypi\n",
"bzip2 1.0.8 h01d97ff_0 conda-forge\n",
"ca-certificates 2019.5.15 1 \n",
"certifi 2019.6.16 py37_1 \n",
"cycler 0.10.0 py_1 conda-forge\n",
"dbus 1.13.6 h90a0687_0 \n",
"decorator 4.4.0 py37_1 \n",
"defusedxml 0.6.0 py_0 \n",
"entrypoints 0.3 py37_0 \n",
"expat 2.2.6 h0a44026_0 \n",
"fastcache 1.1.0 py37h01d97ff_0 conda-forge\n",
"flake8 3.7.7 py37_0 \n",
"freetype 2.10.0 h24853df_1 conda-forge\n",
"gettext 0.19.8.1 h15daf44_3 \n",
"glib 2.56.2 hd9629dc_0 \n",
"gmp 6.1.2 h0a44026_1000 conda-forge\n",
"gmpy2 2.1.0b1 py37h1744cca_0 conda-forge\n",
"icu 58.2 h4b95b61_1 \n",
"ipykernel 5.1.1 py37h39e3cac_0 \n",
"ipython 7.7.0 py37h39e3cac_0 \n",
"ipython_genutils 0.2.0 py37_0 \n",
"ipywidgets 7.5.1 py_0 \n",
"jedi 0.13.3 py37_0 \n",
"jinja2 2.10.1 py37_0 \n",
"jpeg 9b he5867d9_2 \n",
"jsonschema 3.0.1 py37_0 \n",
"jupyter 1.0.0 py37_7 \n",
"jupyter_client 5.3.1 py_0 \n",
"jupyter_console 6.0.0 py37_0 \n",
"jupyter_core 4.5.0 py_0 \n",
"kiwisolver 1.1.0 py37h770b8ee_0 conda-forge\n",
"libblas 3.8.0 12_openblas conda-forge\n",
"libcblas 3.8.0 12_openblas conda-forge\n",
"libcxx 8.0.0 4 conda-forge\n",
"libcxxabi 8.0.0 4 conda-forge\n",
"libffi 3.2.1 h6de7cb9_1006 conda-forge\n",
"libgfortran 4.0.0 1 conda-forge\n",
"libiconv 1.15 hdd342a3_7 \n",
"liblapack 3.8.0 12_openblas conda-forge\n",
"libopenblas 0.3.7 h4bb4525_0 conda-forge\n",
"libpng 1.6.37 h2573ce8_0 conda-forge\n",
"libsodium 1.0.16 h3efe00b_0 \n",
"llvm-openmp 8.0.0 h770b8ee_1 conda-forge\n",
"llvmlite 0.29.0 py37h3fea490_1 conda-forge\n",
"markupsafe 1.1.1 py37h1de35cc_0 \n",
"matplotlib 3.1.1 py37_1 conda-forge\n",
"matplotlib-base 3.1.1 py37h3a684a6_1 conda-forge\n",
"mccabe 0.6.1 py37_1 \n",
"mistune 0.8.4 py37h1de35cc_0 \n",
"mpc 1.1.0 h3aece09_1006 conda-forge\n",
"mpfr 4.0.2 h0a95893_0 conda-forge\n",
"mpmath 1.1.0 py_0 conda-forge\n",
"nbconvert 5.5.0 py_0 \n",
"nbformat 4.4.0 py37_0 \n",
"ncurses 6.1 h0a44026_1002 conda-forge\n",
"notebook 6.0.0 py37_0 \n",
"numba 0.45.1 py37h86efe34_0 conda-forge\n",
"numpy 1.17.0 py37h6b0580a_0 conda-forge\n",
"openssl 1.1.1c h1de35cc_1 \n",
"packaging 19.1 pypi_0 pypi\n",
"pandas 0.25.0 py37h86efe34_0 conda-forge\n",
"pandoc 2.2.3.2 0 \n",
"pandocfilters 1.4.2 py37_1 \n",
"parso 0.5.0 py_0 \n",
"pcre 8.43 h0a44026_0 \n",
"pexpect 4.7.0 py37_0 \n",
"pickleshare 0.7.5 py37_0 \n",
"pillow 6.1.0 pypi_0 pypi\n",
"pip 19.2.2 py37_0 conda-forge\n",
"prometheus_client 0.7.1 py_0 \n",
"prompt_toolkit 2.0.9 py37_0 \n",
"ptyprocess 0.6.0 py37_0 \n",
"pycodestyle 2.5.0 py37_0 \n",
"pyflakes 2.1.1 py37_0 \n",
"pygments 2.4.2 py_0 \n",
"pyparsing 2.4.2 py_0 conda-forge\n",
"pyqt 5.9.2 py37h655552a_2 \n",
"pyrsistent 0.14.11 py37h1de35cc_0 \n",
"python 3.7.3 h93065d6_1 conda-forge\n",
"python-dateutil 2.8.0 py_0 conda-forge\n",
"pytz 2019.2 py_0 conda-forge\n",
"pyyaml 5.1.2 pypi_0 pypi\n",
"pyzmq 18.0.0 py37h0a44026_0 \n",
"qt 5.9.7 h468cd18_1 \n",
"qtconsole 4.5.2 py_0 \n",
"readline 8.0 hcfe32e1_0 conda-forge\n",
"scipy 1.3.1 py37h7e0e109_1 conda-forge\n",
"send2trash 1.5.0 py37_0 \n",
"setuptools 41.0.1 py37_0 conda-forge\n",
"sip 4.19.8 py37h0a44026_0 \n",
"six 1.12.0 py37_0 \n",
"sqlite 3.29.0 hb7d70f7_0 conda-forge\n",
"sympy 1.4 py37_0 conda-forge\n",
"terminado 0.8.2 py37_0 \n",
"testpath 0.4.2 py37_0 \n",
"tk 8.6.9 h2573ce8_1002 conda-forge\n",
"tornado 6.0.3 py37h01d97ff_0 conda-forge\n",
"traitlets 4.3.2 py37_0 \n",
"wcwidth 0.1.7 py37_0 \n",
"webencodings 0.5.1 py37_1 \n",
"wheel 0.33.4 py37_0 conda-forge\n",
"widgetsnbextension 3.5.0 py37_0 \n",
"xz 5.2.4 h1de35cc_1001 conda-forge\n",
"zeromq 4.3.1 h0a44026_3 \n",
"zlib 1.2.11 h01d97ff_1005 conda-forge\n",
"--------------------------------------------------------------------------------\n",
"If requested, please copy and paste the information between\n",
"the dashed (----) lines, or from a given specific section as\n",
"appropriate.\n",
"\n",
"=============================================================\n",
"IMPORTANT: Please ensure that you are happy with sharing the\n",
"contents of the information present, any information that you\n",
"wish to keep private you should remove before sharing.\n",
"=============================================================\n",
"\n"
]
}
],
"source": [
"!numba -s"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment