Skip to content

Instantly share code, notes, and snippets.

@vitillo
Last active April 24, 2018 05:56
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vitillo/5257371 to your computer and use it in GitHub Desktop.
Save vitillo/5257371 to your computer and use it in GitHub Desktop.
Eigen vs IPP, 4x4 Matrix Multiplication
#include <omp.h>
#include <iostream>
#include <immintrin.h>
#include <Eigen/Dense>
#include <ipp.h>
#define N 1000000
using namespace std;
using namespace Eigen;
double start, end;
class MatrixAVX{
public:
MatrixAVX(){
for(int i = 0; i < 16; i++){
m_matrix[i] = i;
}
}
void optMul(MatrixAVX &o, MatrixAVX &tmp) __attribute__((noinline));
private:
__attribute__((aligned(32))) double m_matrix[16];
friend ostream & operator<<(ostream &, const MatrixAVX &matrix);
};
void MatrixAVX::optMul(MatrixAVX &o, MatrixAVX &tmp){
__m256d a_line, b_line, r_line;
__m256d b_line0 = _mm256_load_pd(&o.m_matrix[0]);
__m256d b_line1 = _mm256_load_pd(&o.m_matrix[4]);
__m256d b_line2 = _mm256_load_pd(&o.m_matrix[8]);
__m256d b_line3 = _mm256_load_pd(&o.m_matrix[12]);
a_line = _mm256_set1_pd(m_matrix[0]);
r_line = _mm256_mul_pd(a_line, b_line0);
a_line = _mm256_set1_pd(m_matrix[1]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line1), r_line);
a_line = _mm256_set1_pd(m_matrix[2]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line2), r_line);
a_line = _mm256_set1_pd(m_matrix[3]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line3), r_line);
_mm256_store_pd(&tmp.m_matrix[0], r_line);
a_line = _mm256_set1_pd(m_matrix[4]);
r_line = _mm256_mul_pd(a_line, b_line0);
a_line = _mm256_set1_pd(m_matrix[5]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line1), r_line);
a_line = _mm256_set1_pd(m_matrix[6]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line2), r_line);
a_line = _mm256_set1_pd(m_matrix[7]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line3), r_line);
_mm256_store_pd(&tmp.m_matrix[4], r_line);
a_line = _mm256_set1_pd(m_matrix[8]);
r_line = _mm256_mul_pd(a_line, b_line0);
a_line = _mm256_set1_pd(m_matrix[9]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line1), r_line);
a_line = _mm256_set1_pd(m_matrix[10]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line2), r_line);
a_line = _mm256_set1_pd(m_matrix[11]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line3), r_line);
_mm256_store_pd(&tmp.m_matrix[8], r_line);
a_line = _mm256_set1_pd(m_matrix[12]);
r_line = _mm256_mul_pd(a_line, b_line0);
a_line = _mm256_set1_pd(m_matrix[13]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line1), r_line);
a_line = _mm256_set1_pd(m_matrix[14]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line2), r_line);
a_line = _mm256_set1_pd(m_matrix[15]);
r_line = _mm256_add_pd(_mm256_mul_pd(a_line, b_line3), r_line);
_mm256_store_pd(&tmp.m_matrix[12], r_line);
}
ostream & operator<<(ostream &stream, const MatrixAVX &matrix){
for(int i = 0; i < 4; i++){
for(int j = 0; j < 4; j++)
cout << matrix.m_matrix[i*4 + j] << " ";
if(i != 3)
cout << endl;
}
return stream;
}
void start_clock(){
start = omp_get_wtime();
}
double stop_clock(){
return omp_get_wtime() - start;
}
void SIMD_multiply(){
MatrixAVX a, b, c;
start_clock();
for(int i = 0; i < N; i++)
a.optMul(b, c);
cout << "AVX SIMD: " << stop_clock() << endl << c << endl;
}
void _Eigen_multiply(const Matrix4d &a, const Matrix4d &b, Matrix4d &c) __attribute__((noinline));
void _Eigen_multiply(const Matrix4d &a, const Matrix4d &b, Matrix4d &c){
c.noalias() = a *b;
}
void Eigen_multiply(){
Matrix4d a, b, c;
a << 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15;
b << 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15;
start_clock();
for(int i = 0; i < N; i++)
_Eigen_multiply(a, b, c);
cout << "Eigen: " << stop_clock() << endl << c << endl;;
}
void IPP_multiply(){
double timing;
__attribute__((aligned(32))) Ipp64f a[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
__attribute__((aligned(32))) Ipp64f b[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
__attribute__((aligned(32))) Ipp64f c[16];
int stride1 = sizeof(Ipp64f)*4;
int stride2 = sizeof(Ipp64f);
start_clock();
for(int i = 0; i < N; i++)
ippmMul_mm_64f(a, stride1, stride2, 4, 4, b, stride1, stride2, 4, 4, c, stride1, stride2);
cout << "IPP: " << stop_clock() << endl;
for(int i = 0; i < 4; i++){
for(int j = 0; j < 4; j++)
cout << c[i*4 + j] << " ";
cout << endl;
}
}
int main(){
SIMD_multiply();
Eigen_multiply();
IPP_multiply();
}
CC=g++
CXXFLAGS=-O2 -fopenmp -g -mavx -march=native -I/opt/intel/composerxe/ipp/include -Iinclude/eigen3
LDFLAGS = -lgomp -lipps -lippm
all: main.o testAVX.o testAVX.h
$(CC) $(LDFLAGS) main.o -o kernel
main.o: main.cpp testAVX.h
$(CC) $(CXXFLAGS) -c main.cpp
clean:
-rm -rf *.o kernel
@facontidavide
Copy link

Results?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment