dajuno/vec_dot.cpp

## vec_dot.cpp
/* compile with:
 g++ -O3 -std=c++11 vec_dot.cpp -I/usr/include/openblas -fopenmp -lcblas

 (or clang++)

NOTE:
  - in main, comment either the OpenMP or the BLAS block! using both leads to degraded performance.
  - on my TP X13 G3 with Intel i7-1260P, adding -march=native will lead to lower performance!!
  - other flags like -funroll-loops -flto -finline-functions did not seem to lead to significant performance increases
*/
#include <iostream>
#include <vector>
#include <random>
#include <chrono>
#include <cblas.h>
#include <omp.h>


double dot(std::vector<double> const &a, std::vector<double> const &b)
{
    double sum = 0.0;
    for(int i = 0; i < a.size(); ++i) {
        sum += a[i] * b[i];
    }
    return sum;
}

double dot_omp(std::vector<double> const &a, std::vector<double> const &b)
{
    double sum = 0.0;
    int target_thread_num = 16;
    omp_set_num_threads(target_thread_num);

    #pragma omp parallel for reduction(+:sum)
    for(int i = 0; i < a.size(); ++i) {
        sum += a[i] * b[i];
    }
    return sum;
}


double dot_blas(std::vector<double> const &a, std::vector<double> const &b)
{
    return cblas_ddot(a.size(), &a[0], 1, &b[0], 1);
}


std::vector<double> create_random_vector(int n) {
    std::random_device rd;  // Will be used to obtain a seed for the random number engine
    std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd()
    std::uniform_real_distribution<> dis(0.0, 1.0);
    std::vector<double> vec(n, 0.0);

    for (int i = 0; i < n; ++i) {
        vec[i] = dis(gen);
    }

    return vec;
    }

int main() {
    int n = 100000000;

    std::vector<double> a = create_random_vector(n);
    std::vector<double> b = create_random_vector(n);

    std::cout << "Vector size: " << a.size() << std::endl;

    std::cout << "C++" << std::endl;
    auto t_start = std::chrono::steady_clock::now();
    std::cout << "\tdot: " << dot(a, b) << std::endl;
    auto t_stop = std::chrono::steady_clock::now();
    auto t_elapsed = t_stop - t_start;
    std::cout << "\ttime elapsed: " << std::chrono::duration<double, std::milli>(t_elapsed).count() << " ms" << std::endl;

    // OMP
    /*
    std::cout << "C++ w/ OpenMP" << std::endl;
    t_start = std::chrono::steady_clock::now();
    std::cout << "\tdot: " << dot_omp(a, b) << std::endl;
    t_stop = std::chrono::steady_clock::now();
    t_elapsed = t_stop - t_start;
    std::cout << "\ttime elapsed: " << std::chrono::duration<double, std::milli>(t_elapsed).count() << " ms" << std::endl;
    //*/

    // OpenBLAS
    //*
    std::cout << "C++ w/ OpenBLAS" << std::endl;
    std::cout << "\tOpenBLAS threads: " << openblas_get_num_threads() << std::endl;
    t_start = std::chrono::steady_clock::now();
    std::cout << "\tdot: " << dot_blas(a, b) << std::endl;
    t_stop = std::chrono::steady_clock::now();
    t_elapsed = t_stop - t_start;
    std::cout << "\ttime elapsed: " << std::chrono::duration<double, std::milli>(t_elapsed).count() << " ms" << std::endl;
    //*/


    return 0;
}

## vec_dot.py
import time

import numpy
from numba import njit, prange


def dot_pure(a, b):
    sum = 0.0
    for x, y in zip(a, b):
        sum += x * y
    return sum


@njit
def dot_numba(a, b):
    sum = 0.0
    for x, y in zip(a, b):
        sum += x * y
    return sum


@njit(parallel=True)
def dot_numba_parallel(a, b):
    sum = 0.0
    for i in prange(len(a)):
        sum += a[i] * b[i]
    return sum


def dot_numpy(a, b):
    return numpy.dot(a, b)


def create_random_vector(n):
    return numpy.random.rand(n).astype(numpy.float64)


if __name__ == "__main__":
    n = 100000000
    a = create_random_vector(n)
    b = create_random_vector(n)

    print("Vector size: ", a.size)

    print("Python")
    t_start = time.perf_counter_ns()
    print("\tdot: ", dot_pure(a, b))
    t_stop = time.perf_counter_ns()
    print(f"\ttime elapsed: {(t_stop - t_start) * 1e-6} ms")

    print("NumPy")
    t_start = time.perf_counter_ns()
    print("\tdot: ", dot_numpy(a, b))
    t_stop = time.perf_counter_ns()
    print(f"\ttime elapsed: {(t_stop - t_start) * 1e-6} ms")

    print("Python+numba")
    # call once to exclude compile time in timings
    dot_numba(a, b)
    t_start = time.perf_counter_ns()
    print("\tdot: ", dot_numba(a, b))
    t_stop = time.perf_counter_ns()
    print(f"\ttime elapsed: {(t_stop - t_start) * 1e-6} ms")

    print("Python+numba parallel")
    # call once to exclude compile time in timings
    dot_numba_parallel(a, b)
    t_start = time.perf_counter_ns()
    print("\tdot: ", dot_numba_parallel(a, b))
    t_stop = time.perf_counter_ns()
    print(f"\ttime elapsed: {(t_stop - t_start) * 1e-6} ms")
	/* compile with:
	g++ -O3 -std=c++11 vec_dot.cpp -I/usr/include/openblas -fopenmp -lcblas

	(or clang++)

	NOTE:
	- in main, comment either the OpenMP or the BLAS block! using both leads to degraded performance.
	- on my TP X13 G3 with Intel i7-1260P, adding -march=native will lead to lower performance!!
	- other flags like -funroll-loops -flto -finline-functions did not seem to lead to significant performance increases
	*/
	#include <iostream>
	#include <vector>
	#include <random>
	#include <chrono>
	#include <cblas.h>
	#include <omp.h>


	double dot(std::vector<double> const &a, std::vector<double> const &b)
	{
	double sum = 0.0;
	for(int i = 0; i < a.size(); ++i) {
	sum += a[i] * b[i];
	}
	return sum;
	}

	double dot_omp(std::vector<double> const &a, std::vector<double> const &b)
	{
	double sum = 0.0;
	int target_thread_num = 16;
	omp_set_num_threads(target_thread_num);

	#pragma omp parallel for reduction(+:sum)
	for(int i = 0; i < a.size(); ++i) {
	sum += a[i] * b[i];
	}
	return sum;
	}


	double dot_blas(std::vector<double> const &a, std::vector<double> const &b)
	{
	return cblas_ddot(a.size(), &a[0], 1, &b[0], 1);
	}


	std::vector<double> create_random_vector(int n) {
	std::random_device rd; // Will be used to obtain a seed for the random number engine
	std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd()
	std::uniform_real_distribution<> dis(0.0, 1.0);
	std::vector<double> vec(n, 0.0);

	for (int i = 0; i < n; ++i) {
	vec[i] = dis(gen);
	}

	return vec;
	}

	int main() {
	int n = 100000000;

	std::vector<double> a = create_random_vector(n);
	std::vector<double> b = create_random_vector(n);

	std::cout << "Vector size: " << a.size() << std::endl;

	std::cout << "C++" << std::endl;
	auto t_start = std::chrono::steady_clock::now();
	std::cout << "\tdot: " << dot(a, b) << std::endl;
	auto t_stop = std::chrono::steady_clock::now();
	auto t_elapsed = t_stop - t_start;
	std::cout << "\ttime elapsed: " << std::chrono::duration<double, std::milli>(t_elapsed).count() << " ms" << std::endl;

	// OMP
	/*
	std::cout << "C++ w/ OpenMP" << std::endl;
	t_start = std::chrono::steady_clock::now();
	std::cout << "\tdot: " << dot_omp(a, b) << std::endl;
	t_stop = std::chrono::steady_clock::now();
	t_elapsed = t_stop - t_start;
	std::cout << "\ttime elapsed: " << std::chrono::duration<double, std::milli>(t_elapsed).count() << " ms" << std::endl;
	//*/

	// OpenBLAS
	//*
	std::cout << "C++ w/ OpenBLAS" << std::endl;
	std::cout << "\tOpenBLAS threads: " << openblas_get_num_threads() << std::endl;
	t_start = std::chrono::steady_clock::now();
	std::cout << "\tdot: " << dot_blas(a, b) << std::endl;
	t_stop = std::chrono::steady_clock::now();
	t_elapsed = t_stop - t_start;
	std::cout << "\ttime elapsed: " << std::chrono::duration<double, std::milli>(t_elapsed).count() << " ms" << std::endl;
	//*/


	return 0;
	}
	import time

	import numpy
	from numba import njit, prange


	def dot_pure(a, b):
	sum = 0.0
	for x, y in zip(a, b):
	sum += x * y
	return sum


	@njit
	def dot_numba(a, b):
	sum = 0.0
	for x, y in zip(a, b):
	sum += x * y
	return sum


	@njit(parallel=True)
	def dot_numba_parallel(a, b):
	sum = 0.0
	for i in prange(len(a)):
	sum += a[i] * b[i]
	return sum


	def dot_numpy(a, b):
	return numpy.dot(a, b)


	def create_random_vector(n):
	return numpy.random.rand(n).astype(numpy.float64)


	if __name__ == "__main__":
	n = 100000000
	a = create_random_vector(n)
	b = create_random_vector(n)

	print("Vector size: ", a.size)

	print("Python")
	t_start = time.perf_counter_ns()
	print("\tdot: ", dot_pure(a, b))
	t_stop = time.perf_counter_ns()
	print(f"\ttime elapsed: {(t_stop - t_start) * 1e-6} ms")

	print("NumPy")
	t_start = time.perf_counter_ns()
	print("\tdot: ", dot_numpy(a, b))
	t_stop = time.perf_counter_ns()
	print(f"\ttime elapsed: {(t_stop - t_start) * 1e-6} ms")

	print("Python+numba")
	# call once to exclude compile time in timings
	dot_numba(a, b)
	t_start = time.perf_counter_ns()
	print("\tdot: ", dot_numba(a, b))
	t_stop = time.perf_counter_ns()
	print(f"\ttime elapsed: {(t_stop - t_start) * 1e-6} ms")

	print("Python+numba parallel")
	# call once to exclude compile time in timings
	dot_numba_parallel(a, b)
	t_start = time.perf_counter_ns()
	print("\tdot: ", dot_numba_parallel(a, b))
	t_stop = time.perf_counter_ns()
	print(f"\ttime elapsed: {(t_stop - t_start) * 1e-6} ms")
Method	Time (ms)
Python	8273
Python (NumPy, BLAS using 16 cores)	27
Python (NumPy, using 1 core)	50
Python (Numba JIT)	66
Python (Numba JIT & parallel)	24
C++ (no opt flags)	252
C++ (`-O3`)	63
C++ (`-O3` with BLAS, using all 16 cores)	30
C++ (`-O3` with BLAS, using 1 core)	50
C++ (`-O3` with OpenMPI parallel loop execution on 16 cores)	30