jpivarski/xsimd.ipynb

## xsimd.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext Cython"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%cython --cplus -c-std=c++11 -c-O3 -I/home/ubuntu/xsimd/include -c-march=native\n",
    "\n",
    "from libc.stdint cimport int32_t\n",
    "\n",
    "cdef extern from \"xsimd/types/xsimd_avx512_int32.hpp\":\n",
    "    pass\n",
    "\n",
    "cdef extern from \"xsimd/xsimd.hpp\":\n",
    "    \"\"\"\n",
    "    // In-place inclusive prefix sum (cumulative sum) of numitems int32 values.\n",
    "    // Whole blocks of 16 items are scanned with AVX-512 permutes; any leftover\n",
    "    // items are finished by a scalar loop, so numitems may be any length.\n",
    "    void inplace_cumsum_int32(int32_t* data, size_t numitems) {\n",
    "        // stepK fetches, for each lane, the lane K positions below it and maskK\n",
    "        // zeroes the lowest K lanes, which have no such neighbour (K = 1, 2, 4, 8).\n",
    "        // Adding the four shifted copies in turn yields the prefix sum of a block.\n",
    "        xsimd::batch<int32_t, 16> step1({0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14});\n",
    "        unsigned int mask1 = 65534;  // 0xFFFE\n",
    "\n",
    "        xsimd::batch<int32_t, 16> step2({0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13});\n",
    "        unsigned int mask2 = 65532;  // 0xFFFC\n",
    "\n",
    "        xsimd::batch<int32_t, 16> step3({0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11});\n",
    "        unsigned int mask3 = 65520;  // 0xFFF0\n",
    "\n",
    "        xsimd::batch<int32_t, 16> step4({0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7});\n",
    "        unsigned int mask4 = 65280;  // 0xFF00\n",
    "\n",
    "        // Copies lane 15 (the running total at the end of a block) into every lane.\n",
    "        xsimd::batch<int32_t, 16> scatter({15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15});\n",
    "\n",
    "        xsimd::batch<int32_t, 16> v;\n",
    "        xsimd::batch<int32_t, 16> carry({0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});\n",
    "\n",
    "        // Only whole 16-item blocks go through the vector loop: every load and\n",
    "        // store touches 16 items, so a partial final block would be read and\n",
    "        // written past the end of the buffer.\n",
    "        const size_t numvectorized = numitems - (numitems % 16);\n",
    "        size_t i = 0;\n",
    "        for (;  i < numvectorized;  i += 16) {\n",
    "            v.load_unaligned(&data[i]);\n",
    "\n",
    "            v += _mm512_maskz_permutexvar_epi32(mask1, step1, v);\n",
    "            v += _mm512_maskz_permutexvar_epi32(mask2, step2, v);\n",
    "            v += _mm512_maskz_permutexvar_epi32(mask3, step3, v);\n",
    "            v += _mm512_maskz_permutexvar_epi32(mask4, step4, v);\n",
    "            v += carry;\n",
    "\n",
    "            carry = _mm512_permutexvar_epi32(scatter, v);\n",
    "\n",
    "            v.store_unaligned(&data[i]);\n",
    "        }\n",
    "\n",
    "        // Scalar tail for the remaining (numitems % 16) items, seeded with the\n",
    "        // last prefix sum already stored. Unsigned arithmetic makes overflow\n",
    "        // wrap around instead of being undefined behaviour.\n",
    "        if (i < numitems) {\n",
    "            uint32_t running = (i == 0) ? 0 : (uint32_t)data[i - 1];\n",
    "            for (;  i < numitems;  i++) {\n",
    "                running += (uint32_t)data[i];\n",
    "                data[i] = (int32_t)running;\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "    \"\"\"\n",
    "    void inplace_cumsum_int32(int32_t* data, size_t numitems)\n",
    "\n",
    "import numpy\n",
    "\n",
    "def cumsum(inarray, outarray=None):\n",
    "    \"\"\"Cumulative sum of a one-dimensional int32 array via inplace_cumsum_int32.\n",
    "\n",
    "    inarray:  one-dimensional numpy array of dtype int32.\n",
    "    outarray: optional one-dimensional int32 array of the same length that\n",
    "              receives the result; pass inarray itself to work in place.\n",
    "              A new array is allocated when it is omitted.\n",
    "\n",
    "    Returns outarray. Raises TypeError if either array is not a\n",
    "    one-dimensional int32 numpy array, ValueError if the lengths differ.\n",
    "    \"\"\"\n",
    "    int32_dtype = numpy.dtype(numpy.int32)\n",
    "    if not isinstance(inarray, numpy.ndarray) or not inarray.dtype == int32_dtype or len(inarray.shape) != 1:\n",
    "        raise TypeError(\"inarray must be a one-dimensional array of type int32\")\n",
    "    if outarray is None:\n",
    "        outarray = numpy.empty(len(inarray), dtype=inarray.dtype)\n",
    "    if not isinstance(outarray, numpy.ndarray) or not outarray.dtype == int32_dtype or len(outarray.shape) != 1:\n",
    "        raise TypeError(\"outarray must be a one-dimensional array of type int32\")\n",
    "    if len(outarray) != len(inarray):\n",
    "        # Without this check a length-1 inarray is silently broadcast over outarray.\n",
    "        raise ValueError(\"outarray must have the same length as inarray\")\n",
    "\n",
    "    # The kernel walks a raw pointer and expects densely packed int32 values,\n",
    "    # so a strided view (e.g. a[::2]) is staged through a contiguous buffer.\n",
    "    if outarray.flags.c_contiguous:\n",
    "        workspace = outarray\n",
    "    else:\n",
    "        workspace = numpy.empty(len(outarray), dtype=outarray.dtype)\n",
    "    if workspace is not inarray:\n",
    "        workspace[:] = inarray\n",
    "    inplace_cumsum_int32(<int32_t*>(<size_t>workspace.ctypes.data), <size_t>len(workspace))\n",
    "    if workspace is not outarray:\n",
    "        outarray[:] = workspace\n",
    "    return outarray"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Numpy      0.7051286697387695\n",
      "vectorized 0.06996774673461914\n",
      "\n",
      "Numpy      0.6951799392700195\n",
      "vectorized 0.06896781921386719\n",
      "\n",
      "Numpy      0.6997087001800537\n",
      "vectorized 0.06942558288574219\n",
      "\n",
      "Numpy      0.7024240493774414\n",
      "vectorized 0.06833982467651367\n",
      "\n",
      "Numpy      0.6963632106781006\n",
      "vectorized 0.06911921501159668\n",
      "\n",
      "Numpy      0.6976268291473389\n",
      "vectorized 0.0697774887084961\n",
      "\n",
      "Numpy      0.6992297172546387\n",
      "vectorized 0.06891417503356934\n",
      "\n",
      "Numpy      0.700272798538208\n",
      "vectorized 0.06902408599853516\n",
      "\n",
      "Numpy      0.7043983936309814\n",
      "vectorized 0.06905174255371094\n",
      "\n",
      "Numpy      0.6968092918395996\n",
      "vectorized 0.06934857368469238\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "# Sanity check before timing: the vectorized kernel must agree with NumPy.\n",
    "# 64 items = four full 16-lane blocks, small enough that nothing overflows.\n",
    "check = numpy.arange(64, dtype=numpy.int32)\n",
    "assert numpy.array_equal(cumsum(check), numpy.cumsum(check)), \"vectorized cumsum disagrees with numpy.cumsum\"\n",
    "\n",
    "a = numpy.random.randint(-100, 100, 160000000, dtype=numpy.int32)\n",
    "\n",
    "# Both implementations overwrite `a` in place, so each call accumulates the\n",
    "# previous call's output; only the timings are of interest here.\n",
    "for i in range(10):\n",
    "    # perf_counter is a monotonic, high-resolution clock meant for timing,\n",
    "    # unlike the wall-clock time.time().\n",
    "    startTime = time.perf_counter()\n",
    "    numpy.cumsum(a, out=a)\n",
    "    endTime = time.perf_counter()\n",
    "    print(\"Numpy     \", endTime - startTime)\n",
    "\n",
    "    startTime = time.perf_counter()\n",
    "    cumsum(a, a)\n",
    "    endTime = time.perf_counter()\n",
    "    print(\"vectorized\", endTime - startTime)\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tested on an AWS c5.large, which has AVX-512 instructions (Skylake), with a standard conda installation of NumPy.\n",
    "\n",
    "\n",
    "```\n",
    "cat /proc/cpuinfo\n",
    "processor       : 0\n",
    "vendor_id       : GenuineIntel\n",
    "cpu family      : 6\n",
    "model           : 85\n",
    "model name      : Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz\n",
    "stepping        : 3\n",
    "microcode       : 0x100013e\n",
    "cpu MHz         : 3000.002\n",
    "cache size      : 25344 KB\n",
    "physical id     : 0\n",
    "siblings        : 2\n",
    "core id         : 0\n",
    "cpu cores       : 1\n",
    "apicid          : 0\n",
    "initial apicid  : 0\n",
    "fpu             : yes\n",
    "fpu_exception   : yes\n",
    "cpuid level     : 13\n",
    "wp              : yes\n",
    "flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single retpoline kaiser fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f rdseed adx smap clflushopt clwb avx512cd xsaveopt xsavec xgetbv1 ida arat\n",
    "bugs            : cpu_meltdown spectre_v1 spectre_v2\n",
    "bogomips        : 6000.00\n",
    "clflush size    : 64\n",
    "cache_alignment : 64\n",
    "address sizes   : 46 bits physical, 48 bits virtual\n",
    "power management:\n",
    "\n",
    "processor       : 1\n",
    "vendor_id       : GenuineIntel\n",
    "cpu family      : 6\n",
    "model           : 85\n",
    "model name      : Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz\n",
    "stepping        : 3\n",
    "microcode       : 0x100013e\n",
    "cpu MHz         : 3000.002\n",
    "cache size      : 25344 KB\n",
    "physical id     : 0\n",
    "siblings        : 2\n",
    "core id         : 0\n",
    "cpu cores       : 1\n",
    "apicid          : 1\n",
    "initial apicid  : 1\n",
    "fpu             : yes\n",
    "fpu_exception   : yes\n",
    "cpuid level     : 13\n",
    "wp              : yes\n",
    "flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single retpoline kaiser fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f rdseed adx smap clflushopt clwb avx512cd xsaveopt xsavec xgetbv1 ida arat\n",
    "bugs            : cpu_meltdown spectre_v1 spectre_v2\n",
    "bogomips        : 6000.00\n",
    "clflush size    : 64\n",
    "cache_alignment : 64\n",
    "address sizes   : 46 bits physical, 48 bits virtual\n",
    "power management:\n",
    "```"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"%load_ext Cython"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"%%cython --cplus -c-std=c++11 -c-O3 -I/home/ubuntu/xsimd/include -c-march=native\n",
	"\n",
	"from libc.stdint cimport int32_t\n",
	"\n",
	"cdef extern from \"xsimd/types/xsimd_avx512_int32.hpp\":\n",
	" pass\n",
	"\n",
	"cdef extern from \"xsimd/xsimd.hpp\":\n",
	" \"\"\"\n",
	"    // In-place inclusive prefix sum (cumulative sum) of numitems int32 values.\n",
	"    // Whole blocks of 16 items are scanned with AVX-512 permutes; any leftover\n",
	"    // items are finished by a scalar loop, so numitems may be any length.\n",
	"    void inplace_cumsum_int32(int32_t* data, size_t numitems) {\n",
	"        // stepK fetches, for each lane, the lane K positions below it and maskK\n",
	"        // zeroes the lowest K lanes, which have no such neighbour (K = 1, 2, 4, 8).\n",
	"        // Adding the four shifted copies in turn yields the prefix sum of a block.\n",
	"        xsimd::batch<int32_t, 16> step1({0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});\n",
	"        unsigned int mask1 = 65534;  // 0xFFFE\n",
	"\n",
	"        xsimd::batch<int32_t, 16> step2({0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13});\n",
	"        unsigned int mask2 = 65532;  // 0xFFFC\n",
	"\n",
	"        xsimd::batch<int32_t, 16> step3({0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});\n",
	"        unsigned int mask3 = 65520;  // 0xFFF0\n",
	"\n",
	"        xsimd::batch<int32_t, 16> step4({0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7});\n",
	"        unsigned int mask4 = 65280;  // 0xFF00\n",
	"\n",
	"        // Copies lane 15 (the running total at the end of a block) into every lane.\n",
	"        xsimd::batch<int32_t, 16> scatter({15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15});\n",
	"\n",
	"        xsimd::batch<int32_t, 16> v;\n",
	"        xsimd::batch<int32_t, 16> carry({0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0});\n",
	"\n",
	"        // Only whole 16-item blocks go through the vector loop: every load and\n",
	"        // store touches 16 items, so a partial final block would be read and\n",
	"        // written past the end of the buffer.\n",
	"        const size_t numvectorized = numitems - (numitems % 16);\n",
	"        size_t i = 0;\n",
	"        for (; i < numvectorized; i += 16) {\n",
	"            v.load_unaligned(&data[i]);\n",
	"\n",
	"            v += _mm512_maskz_permutexvar_epi32(mask1, step1, v);\n",
	"            v += _mm512_maskz_permutexvar_epi32(mask2, step2, v);\n",
	"            v += _mm512_maskz_permutexvar_epi32(mask3, step3, v);\n",
	"            v += _mm512_maskz_permutexvar_epi32(mask4, step4, v);\n",
	"            v += carry;\n",
	"\n",
	"            carry = _mm512_permutexvar_epi32(scatter, v);\n",
	"\n",
	"            v.store_unaligned(&data[i]);\n",
	"        }\n",
	"\n",
	"        // Scalar tail for the remaining (numitems % 16) items, seeded with the\n",
	"        // last prefix sum already stored. Unsigned arithmetic makes overflow\n",
	"        // wrap around instead of being undefined behaviour.\n",
	"        if (i < numitems) {\n",
	"            uint32_t running = (i == 0) ? 0 : (uint32_t)data[i - 1];\n",
	"            for (; i < numitems; i++) {\n",
	"                running += (uint32_t)data[i];\n",
	"                data[i] = (int32_t)running;\n",
	"            }\n",
	"        }\n",
	"    }\n",
	" \"\"\"\n",
	" void inplace_cumsum_int32(int32_t* data, size_t numitems)\n",
	"\n",
	"import numpy\n",
	"\n",
	"def cumsum(inarray, outarray=None):\n",
	"    \"\"\"Cumulative sum of a one-dimensional int32 array via inplace_cumsum_int32.\n",
	"\n",
	"    inarray:  one-dimensional numpy array of dtype int32.\n",
	"    outarray: optional one-dimensional int32 array of the same length that\n",
	"              receives the result; pass inarray itself to work in place.\n",
	"              A new array is allocated when it is omitted.\n",
	"\n",
	"    Returns outarray. Raises TypeError if either array is not a\n",
	"    one-dimensional int32 numpy array, ValueError if the lengths differ.\n",
	"    \"\"\"\n",
	"    int32_dtype = numpy.dtype(numpy.int32)\n",
	"    if not isinstance(inarray, numpy.ndarray) or not inarray.dtype == int32_dtype or len(inarray.shape) != 1:\n",
	"        raise TypeError(\"inarray must be a one-dimensional array of type int32\")\n",
	"    if outarray is None:\n",
	"        outarray = numpy.empty(len(inarray), dtype=inarray.dtype)\n",
	"    if not isinstance(outarray, numpy.ndarray) or not outarray.dtype == int32_dtype or len(outarray.shape) != 1:\n",
	"        raise TypeError(\"outarray must be a one-dimensional array of type int32\")\n",
	"    if len(outarray) != len(inarray):\n",
	"        # Without this check a length-1 inarray is silently broadcast over outarray.\n",
	"        raise ValueError(\"outarray must have the same length as inarray\")\n",
	"\n",
	"    # The kernel walks a raw pointer and expects densely packed int32 values,\n",
	"    # so a strided view (e.g. a[::2]) is staged through a contiguous buffer.\n",
	"    if outarray.flags.c_contiguous:\n",
	"        workspace = outarray\n",
	"    else:\n",
	"        workspace = numpy.empty(len(outarray), dtype=outarray.dtype)\n",
	"    if workspace is not inarray:\n",
	"        workspace[:] = inarray\n",
	"    inplace_cumsum_int32(<int32_t*>(<size_t>workspace.ctypes.data), <size_t>len(workspace))\n",
	"    if workspace is not outarray:\n",
	"        outarray[:] = workspace\n",
	"    return outarray"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Numpy 0.7051286697387695\n",
	"vectorized 0.06996774673461914\n",
	"\n",
	"Numpy 0.6951799392700195\n",
	"vectorized 0.06896781921386719\n",
	"\n",
	"Numpy 0.6997087001800537\n",
	"vectorized 0.06942558288574219\n",
	"\n",
	"Numpy 0.7024240493774414\n",
	"vectorized 0.06833982467651367\n",
	"\n",
	"Numpy 0.6963632106781006\n",
	"vectorized 0.06911921501159668\n",
	"\n",
	"Numpy 0.6976268291473389\n",
	"vectorized 0.0697774887084961\n",
	"\n",
	"Numpy 0.6992297172546387\n",
	"vectorized 0.06891417503356934\n",
	"\n",
	"Numpy 0.700272798538208\n",
	"vectorized 0.06902408599853516\n",
	"\n",
	"Numpy 0.7043983936309814\n",
	"vectorized 0.06905174255371094\n",
	"\n",
	"Numpy 0.6968092918395996\n",
	"vectorized 0.06934857368469238\n",
	"\n"
	]
	}
	],
	"source": [
	"import time\n",
	"\n",
	"# Sanity check before timing: the vectorized kernel must agree with NumPy.\n",
	"# 64 items = four full 16-lane blocks, small enough that nothing overflows.\n",
	"check = numpy.arange(64, dtype=numpy.int32)\n",
	"assert numpy.array_equal(cumsum(check), numpy.cumsum(check)), \"vectorized cumsum disagrees with numpy.cumsum\"\n",
	"\n",
	"a = numpy.random.randint(-100, 100, 160000000, dtype=numpy.int32)\n",
	"\n",
	"# Both implementations overwrite `a` in place, so each call accumulates the\n",
	"# previous call's output; only the timings are of interest here.\n",
	"for i in range(10):\n",
	"    # perf_counter is a monotonic, high-resolution clock meant for timing,\n",
	"    # unlike the wall-clock time.time().\n",
	"    startTime = time.perf_counter()\n",
	"    numpy.cumsum(a, out=a)\n",
	"    endTime = time.perf_counter()\n",
	"    print(\"Numpy \", endTime - startTime)\n",
	"\n",
	"    startTime = time.perf_counter()\n",
	"    cumsum(a, a)\n",
	"    endTime = time.perf_counter()\n",
	"    print(\"vectorized\", endTime - startTime)\n",
	"    print()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Tested on an AWS c5.large, which has AVX-512 instructions (Skylake), with a standard conda installation of NumPy.\n",
	"\n",
	"\n",
	"```\n",
	"cat /proc/cpuinfo\n",
	"processor : 0\n",
	"vendor_id : GenuineIntel\n",
	"cpu family : 6\n",
	"model : 85\n",
	"model name : Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz\n",
	"stepping : 3\n",
	"microcode : 0x100013e\n",
	"cpu MHz : 3000.002\n",
	"cache size : 25344 KB\n",
	"physical id : 0\n",
	"siblings : 2\n",
	"core id : 0\n",
	"cpu cores : 1\n",
	"apicid : 0\n",
	"initial apicid : 0\n",
	"fpu : yes\n",
	"fpu_exception : yes\n",
	"cpuid level : 13\n",
	"wp : yes\n",
	"flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single retpoline kaiser fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f rdseed adx smap clflushopt clwb avx512cd xsaveopt xsavec xgetbv1 ida arat\n",
	"bugs : cpu_meltdown spectre_v1 spectre_v2\n",
	"bogomips : 6000.00\n",
	"clflush size : 64\n",
	"cache_alignment : 64\n",
	"address sizes : 46 bits physical, 48 bits virtual\n",
	"power management:\n",
	"\n",
	"processor : 1\n",
	"vendor_id : GenuineIntel\n",
	"cpu family : 6\n",
	"model : 85\n",
	"model name : Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz\n",
	"stepping : 3\n",
	"microcode : 0x100013e\n",
	"cpu MHz : 3000.002\n",
	"cache size : 25344 KB\n",
	"physical id : 0\n",
	"siblings : 2\n",
	"core id : 0\n",
	"cpu cores : 1\n",
	"apicid : 1\n",
	"initial apicid : 1\n",
	"fpu : yes\n",
	"fpu_exception : yes\n",
	"cpuid level : 13\n",
	"wp : yes\n",
	"flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single retpoline kaiser fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f rdseed adx smap clflushopt clwb avx512cd xsaveopt xsavec xgetbv1 ida arat\n",
	"bugs : cpu_meltdown spectre_v1 spectre_v2\n",
	"bogomips : 6000.00\n",
	"clflush size : 64\n",
	"cache_alignment : 64\n",
	"address sizes : 46 bits physical, 48 bits virtual\n",
	"power management:\n",
	"```"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}