public
Created

Matrix Multiplication Optimization

  • Download Gist
matrix.cpp
C++
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <assert.h>
#include <algorithm>
 
 
typedef struct
{
int size;
float ** element;
} matrix;
 
/**
* Returns the number of nanoseconds since the beginning of the program
* Requires linking with librt (-lrt flag on GCC)
**/
long long wall_clock_time()
{
#ifdef __linux__
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
return (long long)(tp.tv_nsec + (long long)tp.tv_sec * 1000000000ll);
#else
#warning "Your timer resoultion might be too low. Compile on Linux and link with librt"
struct timeval tv;
gettimeofday(&tv, NULL);
return (long long)(tv.tv_usec * 1000 + (long long)tv.tv_sec * 1000000000ll);
#endif
}
 
/**
* Allocates memory for a matrix of size SIZE
* The memory is allocated row-major order, i.e.
* elements from the same row are allocated at contiguous
* memory addresses.
**/
void allocate_matrix(matrix* m, int size) {
int i, j;
m->size = size;
// allocate array for all the rows
m->element = (float**)malloc(sizeof(float*) * size);
if (m->element == NULL) {
fprintf(stderr, "Out of memory\n");
exit(1);
}
// allocate an array for each row of the matrix
for (i = 0; i < size; i++) {
m->element[i] = (float*)malloc(sizeof(float) * size);
if (m->element[i] == NULL)
{
fprintf(stderr, "Out of memory\n");
exit(1);
}
}
}
 
/**
* Initializes the elements of the matrix with
* random values between 0 and 9
**/
void init_matrix(matrix m)
{
int i, j;
int size = m.size;
for (i = 0; i < size; i++)
for (j = 0; j < size; j++)
{
m.element[i][j] = rand() % 10;
}
}
 
/**
* Initializes the elements of the matrix with
* element 0.
**/
void init_matrix_zero(matrix m)
{
int i, j;
int size = m.size;
for (i = 0; i < size; i++)
for (j = 0; j < size; j++)
{
m.element[i][j] = 0.0;
}
}
 
 
/**
* Frees the memory of matrix @m
**/
void free_matrix(matrix m)
{
int i;
int size = m.size;
for (i = 0; i < size; i++)
free(m.element[i]);
 
free(m.element);
}
 
void transpose(int size, matrix m) {
int i, j;
for (i = 0; i < size; i++)
for (j = 0; j < size; j++)
std::swap(m.element[i][j], m.element[j][i]);
}
 
/**
* Multiplies matrix @a with matrix @b storing
* the result in matrix @result
*
* The multiplication algorithm is the O(n^3)
* algorithm
*/
void mm(matrix a, matrix b, matrix result) {
int i, j, k;
int size = a.size;
long long before, after;
 
before = wall_clock_time();
// Do the multiplication
transpose(size, b); // transpose the matrix to reduce cache miss
for (i = 0; i < size; i++)
for (j = 0; j < size; j++) {
int tmp = 0; // save memory writes
for(k = 0; k < size; k++)
tmp += a.element[i][k] * b.element[j][k];
result.element[i][j] = tmp;
}
//transpose(size, b);
after = wall_clock_time();
fprintf(stderr, "Matrix multiplication took %1.2f seconds\n", ((float)(after - before))/1000000000);
}
 
 
void print_matrix(matrix m)
{
int i, j;
int size = m.size;
for (i = 0; i < size; i++)
{
printf("row =%4d: ", i);
for (j = 0; j < size; j++)
printf("%6.2f ", m.element[i][j]);
printf("\n");
}
}
 
 
 
void work(int size)
{
matrix a, b, result;
 
// Allocate memory for matrices
allocate_matrix(&a, size);
allocate_matrix(&b, size);
allocate_matrix(&result, size);
 
// Initialize matrix elements
init_matrix(a);
init_matrix(b);
init_matrix_zero(result);
 
// Perform sequential matrix multiplication
mm(a, b, result);
 
// Print the result matrix
//print_matrix(result);
free_matrix(a);
free_matrix(b);
free_matrix(result);
 
}
 
 
int main(int argc, char ** argv)
{
int size;
srand(0);
if (argc >= 2)
size = atoi(argv[1]);
else
size = 1024;
fprintf(stderr,"Sequential matrix multiplication of size %d\n", size);
// Multiply the matrices
work(size);
 
return 0;
}
matrix2.cpp
C++
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
#define SIZE 1024
 
#include <iostream>
#include <string>
#include <cstdio>
#include <stdlib.h>
using namespace std;
 
long long wall_clock_time();
void zero_matrix(float m[SIZE][SIZE]);
void transpose_matrix(float m[SIZE][SIZE]);
void matrix_multiply(float m1[SIZE][SIZE], float m2[SIZE][SIZE], float result[SIZE][SIZE]);
void print_matrix(float m[SIZE][SIZE]);
void random_matrix(float m[SIZE][SIZE]);
 
void zero_matrix(float m[SIZE][SIZE]) {
for (int i = 0; i < SIZE; i++)
for (int j = 0; j < SIZE; j++)
m[i][j] = 0;
}
 
void transpose_matrix(float m[SIZE][SIZE]) {
for (int i = 0; i < SIZE; i++)
for (int j = i + 1; j < SIZE; j++)
swap(m[i][j], m[j][i]);
}
 
void matrix_multiply(float m1[SIZE][SIZE], float m2[SIZE][SIZE], float result[SIZE][SIZE]) {
long long start, end;
 
start = wall_clock_time();
transpose_matrix(m2);
for (int i = 0; i < SIZE; i++) {
for (int j = 0; j < SIZE; j++) {
float tmp = 0;
for (int k = 0; k < SIZE; k++)
tmp += m1[i][k] * m2[j][k];
result[i][j] = tmp;
}
}
end = wall_clock_time();
fprintf(stderr, "Matrix multiplication took %1.2f seconds\n", ((float)(end - start))/1000000000);
}
 
void print_matrix(float m[SIZE][SIZE]) {
for (int i = 0; i < SIZE; i++) {
for (int j = 0; j < SIZE; j++) {
cout << m[i][j] << " ";
}
cout << endl;
}
}
 
void random_matrix(float m[SIZE][SIZE]) {
for (int i = 0; i < SIZE; i++)
for (int j = 0; j < SIZE; j++)
m[i][j] = rand() % 10;
}
 
int main() {
// init matrixes
static float m1[SIZE][SIZE];
static float m2[SIZE][SIZE];
static float result[SIZE][SIZE];
random_matrix(m1);
random_matrix(m2);
zero_matrix(result);
 
// do matrix multiply
matrix_multiply(m1, m2, result);
 
// print result
print_matrix(result);
 
// cleanup
 
return 0;
}
 
 
 
 
/********************************************
* Helpers
*******************************************/
long long wall_clock_time() {
#ifdef __linux__
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
return (long long)(tp.tv_nsec + (long long)tp.tv_sec * 1000000000ll);
#else
#warning "Your timer resoultion might be too low. Compile on Linux and link with librt"
struct timeval tv;
gettimeofday(&tv, NULL);
return (long long)(tv.tv_usec * 1000 + (long long)tv.tv_sec * 1000000000ll);
#endif
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.