Here I compared the effect of different compiler optimizations in both Fortran and C for a program that multiplies a matrix with a vector. The results are below.
Options | C (loop) | Fortran (intrinsic) | Fortran (loop)
---|---|---|---
(none) | 828 ms | 104 ms | 835 ms
-Ofast | 110 ms | 112 ms | 110 ms
-O3 | 362 ms | 361 ms | 363 ms
-O3 -march=native | 362 ms | 363 ms | 361 ms
-O3 -march=native -ffast-math -funroll-loops | 90.3 ms | 92.8 ms | 89.5 ms
-O3 -march=native -ffast-math -funroll-loops -fopenmp | 85.2 ms | 91.2 ms | 86.4 ms
I wanted to understand what differences, if any, exist between C and Fortran, and how the compiler options impact them. I also wanted to know how Fortran's intrinsic matmul compares with manually writing a loop, particularly for multiplying a matrix by a vector, which is just a simple nested loop and easily parallelized. I'm working on a Fortran implementation of LLM inference and want to understand how it can be optimized.
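For reference, the hand-written version boils down to this nested loop. This is a minimal C sketch, not the attached program itself (which also does timing and initialization):

```c
#include <stddef.h>

/* y = A*x for an n x n row-major matrix A.
   The inner loop is a dot product of one row of A with x. */
void matvec(size_t n, const double *A, const double *x, double *y)
{
    for (size_t i = 0; i < n; i++) {
        double sum = 0.0;
        for (size_t j = 0; j < n; j++)
            sum += A[i * n + j] * x[j];
        y[i] = sum;
    }
}
```

The Fortran loop version is the same thing with the index order flipped, since Fortran arrays are column-major.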
The programs and a script to run them are attached. I ran them on my "Intel Core i7 vPro 9th Gen" Lenovo laptop running Ubuntu 20.04, using gfortran-10 and gcc-10. I used a matrix size of 20,000x20,000 and 10 iterations.
The numbers presented are from a random run that I did and were not selected for any special reason. Having run it several times, there are ~2-3 ms of uncertainty in some of the results, so don't read too much into small differences.
Overall, C vs Fortran is a wash, which shouldn't be a surprise. We get about the same performance for the hand-written matmul with all the options on, with and without parallelization.
The most noteworthy thing is how much faster Fortran's intrinsic matmul is out of the box without any optimization, and how that gets ruined with -O3 on. Naively I thought adding -O3 was a good generic way to get a speedup, but clearly that's not true here. Otherwise, -Ofast speeds up the hand-written implementations but still slightly degrades the Fortran intrinsic performance. And intrinsic performance is sped up by -ffast-math and -funroll-loops.
The biggest loser here (other than -O3) is parallelism. Running on 12 cores, parallelizing the loops only knocks a few ms off the time. That's presumably because matvec is memory-bandwidth bound: each matrix element is read exactly once and used for a single multiply-add, so once a couple of cores saturate memory bandwidth, the rest sit idle.
I'd like to know if there are other obvious optimizations that could make this code faster. Currently I think I need to look elsewhere for speedups, while being careful about which compiler options I use.