migueldiascosta/gist:cf4697a2e98bbe345a39ee009e0efb4c

## gistfile1.txt
## comparing SSL2 performance with OpenBLAS using np.dot linked with them

$ module use /projects/easybuild/tests/a/modules/all

$ module load SciPy-bundle/2021.05-foss-2021a

$ ldd $EBROOTSCIPYMINBUNDLE/lib64/python3.9/site-packages/numpy/linalg/_umath_linalg.cpython-39-aarch64-linux-gnu.so | grep blas
	libflexiblas.so.3 => /share/apps-arm/ohpc/easybuild/software/rocky/8.5/a64fx/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib/libflexiblas.so.3 (0x00004000000d0000)

$ for nt in 1 2 4 8 24 48; do echo -n "nthreads=$nt; "; OMP_NUM_THREADS=$nt python -m timeit -n 3 -r 3 -s "import numpy; x = numpy.random.random((1000, 1000))" "numpy.dot(x, x.T)"; done
nthreads=1; 3 loops, best of 3: 314 msec per loop
nthreads=2; 3 loops, best of 3: 162 msec per loop
nthreads=4; 3 loops, best of 3: 84.5 msec per loop
nthreads=8; 3 loops, best of 3: 46.3 msec per loop
nthreads=24; 3 loops, best of 3: 20.6 msec per loop
nthreads=48; 3 loops, best of 3: 15.1 msec per loop
:0: UserWarning: The test results are likely unreliable. The worst time (230 msec) was more than four times slower than the best time (15.1 msec).

$ module purge; module load SciPy-bundle/2021.05-Fujitsu-21.05_openmp

$ ldd $EBROOTSCIPYMINBUNDLE/lib64/python3.9/site-packages/numpy/linalg/_umath_linalg.cpython-39-aarch64-linux-gnu.so | grep lapack
	libfjlapackexsve.so.1 => /opt/FJSVstclanga/cp-1.0.21.02a/lib64/libfjlapackexsve.so.1 (0x00004000000c0000)

$ for nt in 1 2 4 8 24 48; do echo -n "nthreads=$nt; "; OMP_NUM_THREADS=$nt python -m timeit -n 3 -r 3 -s "import numpy; x = numpy.random.random((1000, 1000))" "numpy.dot(x, x.T)"; done
nthreads=1; 3 loops, best of 3: 34.6 msec per loop
nthreads=2; 3 loops, best of 3: 21.1 msec per loop
nthreads=4; 3 loops, best of 3: 13.7 msec per loop
nthreads=8; 3 loops, best of 3: 9.98 msec per loop
nthreads=24; 3 loops, best of 3: 7.63 msec per loop
nthreads=48; 3 loops, best of 3: 7.55 msec per loop
:0: UserWarning: The test results are likely unreliable. The worst time (36.3 msec) was more than four times slower than the best time (7.55 msec).

### matrix is too small when using many threads, increasing the size

$ module purge; module load SciPy-bundle/2021.05-foss-2021a
$ for nt in 1 2 4 8 24 48; do echo -n "nthreads=$nt; "; OMP_NUM_THREADS=$nt python -m timeit -n 3 -r 3 -s "import numpy; x = numpy.random.random((5000, 5000))" "numpy.dot(x, x.T)"; done
nthreads=1; 3 loops, best of 3: 38.8 sec per loop
nthreads=2; 3 loops, best of 3: 19.8 sec per loop
nthreads=4; 3 loops, best of 3: 10.2 sec per loop
nthreads=8; 3 loops, best of 3: 5.42 sec per loop
nthreads=24; 3 loops, best of 3: 2.22 sec per loop
nthreads=48; 3 loops, best of 3: 1.5 sec per loop

$ module purge; module load SciPy-bundle/2021.05-Fujitsu-21.05_openmp
$ for nt in 1 2 4 8 24 48; do echo -n "nthreads=$nt; "; OMP_NUM_THREADS=$nt python -m timeit -n 3 -r 3 -s "import numpy; x = numpy.random.random((5000, 5000))" "numpy.dot(x, x.T)"; done
nthreads=1; 3 loops, best of 3: 3.13 sec per loop
nthreads=2; 3 loops, best of 3: 1.65 sec per loop
nthreads=4; 3 loops, best of 3: 898 msec per loop
nthreads=8; 3 loops, best of 3: 547 msec per loop
nthreads=24; 3 loops, best of 3: 317 msec per loop
nthreads=48; 3 loops, best of 3: 270 msec per loop

### on a64fx, SSL2 is ~10x faster than OpenBLAS (at least this version of OpenBLAS, 0.3.15)

## gistfile2.txt
Running motorBike OpenFOAM example with different (.org) versions and toolchains:
(obtained with ~/fujitsu/FCC-2022a_gcccore_largepage/run_openfoam_motorbike.script)
OpenFOAM/10-gompi-2022a: 470.816562465 seconds
OpenFOAM/10-ffmpi-2022a: 237.087816735 seconds (~2x faster with Fujitsu)
OpenFOAM/11-gompi-2022a: 360.682005787 seconds
OpenFOAM/11-ffmpi-2022a: 40.391527638 seconds (hm, ~9x faster? or something wrong?)

Some .com versions (v2206 and v2312) are also installed with both gompi and ffmpi, but some utilities seem to be missing when built with ffmpi, and even with gompi there seem to be issues running the motorBike example (TODO)
	## comparing SSL2 performance with OpenBLAS using np.dot linked with them

	$ module use /projects/easybuild/tests/a/modules/all

	$ module load SciPy-bundle/2021.05-foss-2021a

	$ ldd $EBROOTSCIPYMINBUNDLE/lib64/python3.9/site-packages/numpy/linalg/_umath_linalg.cpython-39-aarch64-linux-gnu.so | grep blas
	libflexiblas.so.3 => /share/apps-arm/ohpc/easybuild/software/rocky/8.5/a64fx/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib/libflexiblas.so.3 (0x00004000000d0000)

	$ for nt in 1 2 4 8 24 48; do echo -n "nthreads=$nt; "; OMP_NUM_THREADS=$nt python -m timeit -n 3 -r 3 -s "import numpy; x = numpy.random.random((1000, 1000))" "numpy.dot(x, x.T)"; done
	nthreads=1; 3 loops, best of 3: 314 msec per loop
	nthreads=2; 3 loops, best of 3: 162 msec per loop
	nthreads=4; 3 loops, best of 3: 84.5 msec per loop
	nthreads=8; 3 loops, best of 3: 46.3 msec per loop
	nthreads=24; 3 loops, best of 3: 20.6 msec per loop
	nthreads=48; 3 loops, best of 3: 15.1 msec per loop
	:0: UserWarning: The test results are likely unreliable. The worst time (230 msec) was more than four times slower than the best time (15.1 msec).

	$ module purge; module load SciPy-bundle/2021.05-Fujitsu-21.05_openmp

	$ ldd $EBROOTSCIPYMINBUNDLE/lib64/python3.9/site-packages/numpy/linalg/_umath_linalg.cpython-39-aarch64-linux-gnu.so | grep lapack
	libfjlapackexsve.so.1 => /opt/FJSVstclanga/cp-1.0.21.02a/lib64/libfjlapackexsve.so.1 (0x00004000000c0000)

	$ for nt in 1 2 4 8 24 48; do echo -n "nthreads=$nt; "; OMP_NUM_THREADS=$nt python -m timeit -n 3 -r 3 -s "import numpy; x = numpy.random.random((1000, 1000))" "numpy.dot(x, x.T)"; done
	nthreads=1; 3 loops, best of 3: 34.6 msec per loop
	nthreads=2; 3 loops, best of 3: 21.1 msec per loop
	nthreads=4; 3 loops, best of 3: 13.7 msec per loop
	nthreads=8; 3 loops, best of 3: 9.98 msec per loop
	nthreads=24; 3 loops, best of 3: 7.63 msec per loop
	nthreads=48; 3 loops, best of 3: 7.55 msec per loop
	:0: UserWarning: The test results are likely unreliable. The worst time (36.3 msec) was more than four times slower than the best time (7.55 msec).

	### matrix is too small when using many threads, increasing the size

	$ module purge; module load SciPy-bundle/2021.05-foss-2021a
	$ for nt in 1 2 4 8 24 48; do echo -n "nthreads=$nt; "; OMP_NUM_THREADS=$nt python -m timeit -n 3 -r 3 -s "import numpy; x = numpy.random.random((5000, 5000))" "numpy.dot(x, x.T)"; done
	nthreads=1; 3 loops, best of 3: 38.8 sec per loop
	nthreads=2; 3 loops, best of 3: 19.8 sec per loop
	nthreads=4; 3 loops, best of 3: 10.2 sec per loop
	nthreads=8; 3 loops, best of 3: 5.42 sec per loop
	nthreads=24; 3 loops, best of 3: 2.22 sec per loop
	nthreads=48; 3 loops, best of 3: 1.5 sec per loop

	$ module purge; module load SciPy-bundle/2021.05-Fujitsu-21.05_openmp
	$ for nt in 1 2 4 8 24 48; do echo -n "nthreads=$nt; "; OMP_NUM_THREADS=$nt python -m timeit -n 3 -r 3 -s "import numpy; x = numpy.random.random((5000, 5000))" "numpy.dot(x, x.T)"; done
	nthreads=1; 3 loops, best of 3: 3.13 sec per loop
	nthreads=2; 3 loops, best of 3: 1.65 sec per loop
	nthreads=4; 3 loops, best of 3: 898 msec per loop
	nthreads=8; 3 loops, best of 3: 547 msec per loop
	nthreads=24; 3 loops, best of 3: 317 msec per loop
	nthreads=48; 3 loops, best of 3: 270 msec per loop

	### on a64fx, SSL2 is ~10x faster than OpenBLAS (at least this version of OpenBLAS, 0.3.15)
	Running motorBike OpenFOAM example with different (.org) versions and toolchains:
	(obtained with ~/fujitsu/FCC-2022a_gcccore_largepage/run_openfoam_motorbike.script)
	OpenFOAM/10-gompi-2022a: 470.816562465 seconds
	OpenFOAM/10-ffmpi-2022a: 237.087816735 seconds (~2x faster with Fujitsu)
	OpenFOAM/11-gompi-2022a: 360.682005787 seconds
	OpenFOAM/11-ffmpi-2022a: 40.391527638 seconds (hm, ~9x faster? or something wrong?)

	Some .com versions (v2206 and v2312) are also installed with both gompi and ffmpi, but some utilities seem to be missing when built with ffmpi, and even with gompi there seem to be issues running the motorBike example (TODO)