brandonwillard/both-opt-max-reduce-axis-1.pdf

## both-opt-max-reduce-axis-1.pdf

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              both-opt-max-reduce-axis-1.pdf
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## both-unopt-max-reduce-axis-1.pdf

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              both-unopt-max-reduce-axis-1.pdf
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## clang-comparison.md

      
    Raw
  

              clang-comparison.md
            
          
    #include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

const int NROWS = 5000;
const int NCOLS = 6000;


inline double custom_max(double x, double y) {
  if (x < y) {
    return y;
  } else {
    return x;
  }
}

double * max_reduce_axis_1(double ** x) {

  double * res = (double *) malloc(NROWS * sizeof(double));
  for (int i=0; i < NROWS; i++)
    res[i] = -INFINITY;

  if (res == NULL)
    exit(1);

  for (int i=0; i < NROWS; i++) {
    for (int j=0; j < NCOLS; j++) {
      res[i] = custom_max(res[i], x[i][j]);
    }
  }

  return res;
}

inline void custom_max_inplace(double *res, int i, double y) {
  if (res[i] < y)
    res[i] = y;
}

double * max_reduce_axis_1_inplace(double ** x) {

  double * res = (double *) malloc(NROWS * sizeof(double));
  for (int i=0; i < NROWS; i++)
    res[i] = -INFINITY;

  if (res == NULL)
    exit(1);

  for (int i=0; i < NROWS; i++) {
    for (int j=0; j < NCOLS; j++) {
      custom_max_inplace(res, i, x[i][j]);
    }
  }

  return res;
}

double * max_reduce_axis_1_nocall(double ** x) {

  double * res = (double *) malloc(NROWS * sizeof(double));
  for (int i=0; i < NROWS; i++)
    res[i] = -INFINITY;

  if (res == NULL)
    exit(1);

  for (int i=0; i < NROWS; i++) {
    for (int j=0; j < NCOLS; j++) {
      double tmp = x[i][j];
      if (res[i] < tmp)
        res[i] = tmp;
    }
  }

  return res;
}

void simulate_data(double ** x) {
  for(int i = 0; i < NROWS; i++) {
    for(int j = 0; j < NCOLS; j++) {
      float val = (float)(rand() % 10);
      x[i][j] = val;
    }
  }
}


int main() {

  srand(time(NULL));

  double ** x;

  x = malloc(NROWS * sizeof(double *));

  if (x == NULL)
    exit(1);

  for(int i = 0; i < NROWS; i++) {
    x[i] = malloc(NCOLS * sizeof(double));

    if (x == NULL)
      exit(1);
  }

  printf("call\tcall-inplace\tnocall");
  for (int i = 0; i < 10; i++) {
    simulate_data(x);

    clock_t begin = clock();
    double *res_1 = max_reduce_axis_1(x);
    clock_t end = clock();
    double call_time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

    begin = clock();
    double *res_2 = max_reduce_axis_1_inplace(x);
    end = clock();
    double call_inplace_time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

    begin = clock();
    double *res_3 = max_reduce_axis_1_nocall(x);
    end = clock();
    double no_call_time_spent = (double)(end - begin) / CLOCKS_PER_SEC;

    printf("\n%f\t%f\t%f", call_time_spent, call_inplace_time_spent, no_call_time_spent);

    for(int i = 0; i < NROWS; i++) {
      if (res_1[i] != res_2[i] || res_1[i] != res_3[i])
        perror("results not equal");
    }
    free((void *) res_1);
    free((void *) res_2);
    free((void *) res_3);
  }


  // Clean up
  for(int i = 0; i < NROWS; i++) {
    free((void *) x[i]);
  }
  free((void *) x);
}
clang -v -O3 -g max_reduce_axis.c -o max_reduce_axis_test
clang version 10.0.0-4ubuntu1
Target: x86_64-pc-linux-gnu
Thread model: posix
InstalledDir: /usr/bin
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/10
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/7
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/7.5.0
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/8
Found candidate GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/9
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/10
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/7.5.0
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/8
Found candidate GCC installation: /usr/lib/gcc/x86_64-linux-gnu/9
Selected GCC installation: /usr/bin/../lib/gcc/x86_64-linux-gnu/10
Candidate multilib: .;@m64
Selected multilib: .;@m64
 "/usr/lib/llvm-10/bin/clang" -cc1 -triple x86_64-pc-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name max_reduce_axis.c -mrelocation-model static -mthread-model posix -mframe-pointer=none -fmath-errno -fno-rounding-math -masm-verbose -mconstructor-aliases -munwind-tables -target-cpu x86-64 -dwarf-column-info -fno-split-dwarf-inlining -debug-info-kind=limited -dwarf-version=4 -debugger-tuning=gdb -v -resource-dir /usr/lib/llvm-10/lib/clang/10.0.0 -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-10/lib/clang/10.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -fdebug-compilation-dir /tmp -ferror-limit 19 -fmessage-length 231 -fgnuc-version=4.2.1 -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -faddrsig -o /tmp/user/1000/max_reduce_axis-4a51bb.o -x c max_reduce_axis.c
clang -cc1 version 10.0.0 based upon LLVM 10.0.0 default target x86_64-pc-linux-gnu
ignoring nonexistent directory "/include"
include "..." search starts here:
include <...> search starts here:
 /usr/local/include
 /usr/lib/llvm-10/lib/clang/10.0.0/include
 /usr/include/x86_64-linux-gnu
 /usr/include
End of search list.
 "/usr/bin/ld" -z relro --hash-style=gnu --build-id --eh-frame-hdr -m elf_x86_64 -dynamic-linker /lib64/ld-linux-x86-64.so.2 -o max_reduce_axis_test /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu/crt1.o /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu/crti.o /usr/bin/../lib/gcc/x86_64-linux-gnu/10/crtbegin.o -L/usr/bin/../lib/gcc/x86_64-linux-gnu/10 -L/usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu -L/lib/x86_64-linux-gnu -L/lib/../lib64 -L/usr/lib/x86_64-linux-gnu -L/usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../.. -L/usr/lib/llvm-10/bin/../lib -L/lib -L/usr/lib /tmp/user/1000/max_reduce_axis-4a51bb.o -lgcc --as-needed -lgcc_s --no-as-needed -lc -lgcc --as-needed -lgcc_s --no-as-needed /usr/bin/../lib/gcc/x86_64-linux-gnu/10/crtend.o /usr/bin/../lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu/crtn.o
./max_reduce_axis_test
call	call-inplace	nocall
0.033317	0.019050	0.019931
0.033618	0.019099	0.019679
0.032939	0.018802	0.019675
0.032866	0.018517	0.019641
0.032872	0.018449	0.019533
0.032871	0.018405	0.019492
0.032842	0.018149	0.019479
0.032894	0.018311	0.019516
0.033016	0.018266	0.019554

  
## numba-careduce-mwe.py
from contextlib import contextmanager

import numba
import numpy as np


X = np.random.normal(size=(5000, 6000))

#
# This is what we're trying to implement in Numba:
#
numpy_res = np.max(X, axis=1)

# This is what we're trying to match/beat:
# %timeit np.max(X, axis=1)
# 15.6 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

#
# NOTE: We do *not* want to enable `parallel=True`, `fastmath=True`, or any
# other options that are neither generalizable nor used by NumPy.
#

@numba.njit(inline="always")
def custom_max(x, y):
    if x > y:
        return x
    else:
        return y


#
# Case 1: Using `vectorize` along the reduced dimension
#
@numba.vectorize(["float64(float64, float64)"])
def vectorized_max(x, y):
    return custom_max(x, y)


@numba.njit
def vectorized_max_reduce_axis_1(x):
    res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
    x_transpose = np.transpose(x)
    for m in range(x.shape[1]):
        vectorized_max(res, x_transpose[m], res)

    return res


# Confirm that it works
assert np.array_equal(numpy_res, vectorized_max_reduce_axis_1(X))

# %timeit vectorized_max_reduce_axis_1(X)
# 94.2 ms ± 435 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

#
# Case 2: Manually written Numba reduction loops
#
@numba.njit
def manual_max_reduce_axis_1(x):
    res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            res[i] = custom_max(res[i], x[i, j])
    return res


assert np.array_equal(numpy_res, manual_max_reduce_axis_1(X))

# %timeit manual_max_reduce_axis_1(X)
# 38.3 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


#
# Case 3: Calls to NumPy's `ufunc.reduce` via `objmode`
#
@numba.njit
def objmode_max_reduce_axis_1(x):
    with numba.objmode(res="float64[:]"):
        res = vectorized_max.reduce(x, axis=1)
    return res


assert np.array_equal(numpy_res, objmode_max_reduce_axis_1(X))

# %timeit objmode_max_reduce_axis_1(X)
# 73.7 ms ± 262 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#
# Case 4: Recompile Case 2 with more LLVM optimizations in the "cheap" pass
#
# This comes from the discussions in
# https://numba.discourse.group/t/numba-performance-doesnt-scale-as-well-as-numpy-in-vectorized-max-function/782
#
@contextmanager
def use_optimized_cheap_pass(*args, **kwargs):
    """Temporarily replace the cheap optimization pass with a better one."""
    from numba.core.registry import cpu_target

    context = cpu_target.target_context._internal_codegen
    old_pm = context._mpm_cheap
    new_pm = context._module_pass_manager(
        loop_vectorize=True, slp_vectorize=True, opt=3, cost="cheap"
    )
    context._mpm_cheap = new_pm
    try:
        yield
    finally:
        context._mpm_cheap = old_pm


with use_optimized_cheap_pass():

    @numba.njit
    def opt_manual_max_reduce_axis_1(x):
        res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                res[i] = custom_max(res[i], x[i, j])
        return res

    assert np.array_equal(numpy_res, opt_manual_max_reduce_axis_1(X))

# %timeit opt_manual_max_reduce_axis_1(X)
# 37.6 ms ± 872 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#
# Case 5: This is Stuart's original form of the reduction
#
# Apparently, removing the use of `custom_max` and manually "in-lining" the
# `max` operation makes a large difference; however, we can't reasonably do
# this for every binary function that will perform a reduction.
#
with use_optimized_cheap_pass():

    @numba.njit
    def orig_opt_max_reduce_axis_1(x):
        res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                tmp = x[i, j]
                if res[i] < tmp:
                    res[i] = tmp
        return res

    assert np.array_equal(numpy_res, orig_opt_max_reduce_axis_1(X))

# %timeit orig_opt_max_reduce_axis_1(X)
# 20.3 ms ± 233 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

#
# Case 5: Use an in-place version of `custom_max`
#
with use_optimized_cheap_pass():

    @numba.njit
    def custom_max(res, i, y):
        if res[i] < y:
            res[i] = y

    @numba.njit
    def inplace_opt_max_reduce_axis_1(x):
        res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
        for i in range(x.shape[0]):
            for j in range(x.shape[1]):
                custom_max(res, i, x[i, j])
        return res

    assert np.array_equal(numpy_res, inplace_opt_max_reduce_axis_1(X))

# %timeit inplace_opt_max_reduce_axis_1(X)
# 19.5 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## produce-cfg-plots.py
from contextlib import contextmanager, suppress

import numba
import numpy as np


numba.config.DEBUGINFO_DEFAULT = 1

X = np.random.normal(size=(5000, 6000))

numpy_res = np.max(X, axis=1)


@contextmanager
def use_optimized_cheap_pass(*args, **kwargs):
    """Temporarily replace the cheap optimization pass with a better one."""
    from numba.core.registry import cpu_target

    context = cpu_target.target_context._internal_codegen
    old_pm = context._mpm_cheap
    new_pm = context._module_pass_manager(
        loop_vectorize=True, slp_vectorize=True, opt=3, cost="cheap"
    )
    context._mpm_cheap = new_pm
    try:
        yield
    finally:
        context._mpm_cheap = old_pm


for opt_type in ["unopt", "opt"]:

    cm = suppress if opt_type == "unopt" else use_optimized_cheap_pass
    numba.config.OPT = 0 if opt_type == "unopt" else 3

    @numba.njit(inline="always")
    def custom_max(x, y):
        if x > y:
            return x
        else:
            return y

    with cm():

        @numba.njit
        def slow_max_reduce_axis_1(x):
            res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
            for i in range(x.shape[0]):
                for j in range(x.shape[1]):
                    res[i] = custom_max(res[i], x[i, j])
            return res

        assert np.array_equal(numpy_res, slow_max_reduce_axis_1(X))

    res = slow_max_reduce_axis_1.inspect_cfg(
        slow_max_reduce_axis_1.signatures[0],
        strip_ir=False,
        interleave=True,
        fontsize=10,
    )
    _ = res.display(filename=f"{opt_type}-slow-max-reduce-axis-1", format="pdf")

    with cm():

        @numba.njit
        def fast_max_reduce_axis_1(x):
            res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
            for i in range(x.shape[0]):
                for j in range(x.shape[1]):
                    tmp = x[i, j]
                    if res[i] < tmp:
                        res[i] = tmp
            return res

        assert np.array_equal(numpy_res, fast_max_reduce_axis_1(X))

    res = fast_max_reduce_axis_1.inspect_cfg(
        fast_max_reduce_axis_1.signatures[0],
        strip_ir=False,
        interleave=True,
        fontsize=10,
    )
    _ = res.display(filename=f"{opt_type}-fast-max-reduce-axis-1", format="pdf")
	from contextlib import contextmanager

	import numba
	import numpy as np


	X = np.random.normal(size=(5000, 6000))

	#
	# This is what we're trying to implement in Numba:
	#
	numpy_res = np.max(X, axis=1)

	# This is what we're trying to match/beat:
	# %timeit np.max(X, axis=1)
	# 15.6 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

	#
	# NOTE: We do not want to enable `parallel=True`, `fastmath=True`, or any
	# other options that are neither generalizable nor used by NumPy.
	#

	@numba.njit(inline="always")
	def custom_max(x, y):
	if x > y:
	return x
	else:
	return y


	#
	# Case 1: Using `vectorize` along the reduced dimension
	#
	@numba.vectorize(["float64(float64, float64)"])
	def vectorized_max(x, y):
	return custom_max(x, y)


	@numba.njit
	def vectorized_max_reduce_axis_1(x):
	res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
	x_transpose = np.transpose(x)
	for m in range(x.shape[1]):
	vectorized_max(res, x_transpose[m], res)

	return res


	# Confirm that it works
	assert np.array_equal(numpy_res, vectorized_max_reduce_axis_1(X))

	# %timeit vectorized_max_reduce_axis_1(X)
	# 94.2 ms ± 435 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

	#
	# Case 2: Manually written Numba reduction loops
	#
	@numba.njit
	def manual_max_reduce_axis_1(x):
	res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
	for i in range(x.shape[0]):
	for j in range(x.shape[1]):
	res[i] = custom_max(res[i], x[i, j])
	return res


	assert np.array_equal(numpy_res, manual_max_reduce_axis_1(X))

	# %timeit manual_max_reduce_axis_1(X)
	# 38.3 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


	#
	# Case 3: Calls to NumPy's `ufunc.reduce` via `objmode`
	#
	@numba.njit
	def objmode_max_reduce_axis_1(x):
	with numba.objmode(res="float64[:]"):
	res = vectorized_max.reduce(x, axis=1)
	return res


	assert np.array_equal(numpy_res, objmode_max_reduce_axis_1(X))

	# %timeit objmode_max_reduce_axis_1(X)
	# 73.7 ms ± 262 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


	#
	# Case 4: Recompile Case 2 with more LLVM optimizations in the "cheap" pass
	#
	# This comes from the discussions in
	# https://numba.discourse.group/t/numba-performance-doesnt-scale-as-well-as-numpy-in-vectorized-max-function/782
	#
	@contextmanager
	def use_optimized_cheap_pass(args, *kwargs):
	"""Temporarily replace the cheap optimization pass with a better one."""
	from numba.core.registry import cpu_target

	context = cpu_target.target_context._internal_codegen
	old_pm = context._mpm_cheap
	new_pm = context._module_pass_manager(
	loop_vectorize=True, slp_vectorize=True, opt=3, cost="cheap"
	)
	context._mpm_cheap = new_pm
	try:
	yield
	finally:
	context._mpm_cheap = old_pm


	with use_optimized_cheap_pass():

	@numba.njit
	def opt_manual_max_reduce_axis_1(x):
	res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
	for i in range(x.shape[0]):
	for j in range(x.shape[1]):
	res[i] = custom_max(res[i], x[i, j])
	return res

	assert np.array_equal(numpy_res, opt_manual_max_reduce_axis_1(X))

	# %timeit opt_manual_max_reduce_axis_1(X)
	# 37.6 ms ± 872 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


	#
	# Case 5: This is Stuart's original form of the reduction
	#
	# Apparently, removing the use of `custom_max` and manually "in-lining" the
	# `max` operation makes a large difference; however, we can't reasonably do
	# this for every binary function that will perform a reduction.
	#
	with use_optimized_cheap_pass():

	@numba.njit
	def orig_opt_max_reduce_axis_1(x):
	res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
	for i in range(x.shape[0]):
	for j in range(x.shape[1]):
	tmp = x[i, j]
	if res[i] < tmp:
	res[i] = tmp
	return res

	assert np.array_equal(numpy_res, orig_opt_max_reduce_axis_1(X))

	# %timeit orig_opt_max_reduce_axis_1(X)
	# 20.3 ms ± 233 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

	#
	# Case 5: Use an in-place version of `custom_max`
	#
	with use_optimized_cheap_pass():

	@numba.njit
	def custom_max(res, i, y):
	if res[i] < y:
	res[i] = y

	@numba.njit
	def inplace_opt_max_reduce_axis_1(x):
	res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
	for i in range(x.shape[0]):
	for j in range(x.shape[1]):
	custom_max(res, i, x[i, j])
	return res

	assert np.array_equal(numpy_res, inplace_opt_max_reduce_axis_1(X))

	# %timeit inplace_opt_max_reduce_axis_1(X)
	# 19.5 ms ± 144 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
	from contextlib import contextmanager, suppress

	import numba
	import numpy as np


	numba.config.DEBUGINFO_DEFAULT = 1

	X = np.random.normal(size=(5000, 6000))

	numpy_res = np.max(X, axis=1)


	@contextmanager
	def use_optimized_cheap_pass(args, *kwargs):
	"""Temporarily replace the cheap optimization pass with a better one."""
	from numba.core.registry import cpu_target

	context = cpu_target.target_context._internal_codegen
	old_pm = context._mpm_cheap
	new_pm = context._module_pass_manager(
	loop_vectorize=True, slp_vectorize=True, opt=3, cost="cheap"
	)
	context._mpm_cheap = new_pm
	try:
	yield
	finally:
	context._mpm_cheap = old_pm


	for opt_type in ["unopt", "opt"]:

	cm = suppress if opt_type == "unopt" else use_optimized_cheap_pass
	numba.config.OPT = 0 if opt_type == "unopt" else 3

	@numba.njit(inline="always")
	def custom_max(x, y):
	if x > y:
	return x
	else:
	return y

	with cm():

	@numba.njit
	def slow_max_reduce_axis_1(x):
	res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
	for i in range(x.shape[0]):
	for j in range(x.shape[1]):
	res[i] = custom_max(res[i], x[i, j])
	return res

	assert np.array_equal(numpy_res, slow_max_reduce_axis_1(X))

	res = slow_max_reduce_axis_1.inspect_cfg(
	slow_max_reduce_axis_1.signatures[0],
	strip_ir=False,
	interleave=True,
	fontsize=10,
	)
	_ = res.display(filename=f"{opt_type}-slow-max-reduce-axis-1", format="pdf")

	with cm():

	@numba.njit
	def fast_max_reduce_axis_1(x):
	res = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
	for i in range(x.shape[0]):
	for j in range(x.shape[1]):
	tmp = x[i, j]
	if res[i] < tmp:
	res[i] = tmp
	return res

	assert np.array_equal(numpy_res, fast_max_reduce_axis_1(X))

	res = fast_max_reduce_axis_1.inspect_cfg(
	fast_max_reduce_axis_1.signatures[0],
	strip_ir=False,
	interleave=True,
	fontsize=10,
	)
	_ = res.display(filename=f"{opt_type}-fast-max-reduce-axis-1", format="pdf")