Skip to content

Instantly share code, notes, and snippets.

@jiewmeng
Created October 3, 2012 03:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jiewmeng/3824894 to your computer and use it in GitHub Desktop.
Save jiewmeng/3824894 to your computer and use it in GitHub Desktop.
Matrix Multiplication Optimization
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <assert.h>
#include <algorithm>
typedef struct
{
int size;
float ** element;
} matrix;
/**
* Returns the number of nanoseconds since the beginning of the program
* Requires linking with librt (-lrt flag on GCC)
**/
long long wall_clock_time()
{
#ifdef __linux__
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
return (long long)(tp.tv_nsec + (long long)tp.tv_sec * 1000000000ll);
#else
#warning "Your timer resoultion might be too low. Compile on Linux and link with librt"
struct timeval tv;
gettimeofday(&tv, NULL);
return (long long)(tv.tv_usec * 1000 + (long long)tv.tv_sec * 1000000000ll);
#endif
}
/**
* Allocates memory for a matrix of size SIZE
* The memory is allocated row-major order, i.e.
* elements from the same row are allocated at contiguous
* memory addresses.
**/
void allocate_matrix(matrix* m, int size) {
int i, j;
m->size = size;
// allocate array for all the rows
m->element = (float**)malloc(sizeof(float*) * size);
if (m->element == NULL) {
fprintf(stderr, "Out of memory\n");
exit(1);
}
// allocate an array for each row of the matrix
for (i = 0; i < size; i++) {
m->element[i] = (float*)malloc(sizeof(float) * size);
if (m->element[i] == NULL)
{
fprintf(stderr, "Out of memory\n");
exit(1);
}
}
}
/**
* Initializes the elements of the matrix with
* random values between 0 and 9
**/
void init_matrix(matrix m)
{
int i, j;
int size = m.size;
for (i = 0; i < size; i++)
for (j = 0; j < size; j++)
{
m.element[i][j] = rand() % 10;
}
}
/**
* Initializes the elements of the matrix with
* element 0.
**/
void init_matrix_zero(matrix m)
{
int i, j;
int size = m.size;
for (i = 0; i < size; i++)
for (j = 0; j < size; j++)
{
m.element[i][j] = 0.0;
}
}
/**
* Frees the memory of matrix @m
**/
void free_matrix(matrix m)
{
int i;
int size = m.size;
for (i = 0; i < size; i++)
free(m.element[i]);
free(m.element);
}
void transpose(int size, matrix m) {
int i, j;
for (i = 0; i < size; i++)
for (j = 0; j < size; j++)
std::swap(m.element[i][j], m.element[j][i]);
}
/**
* Multiplies matrix @a with matrix @b storing
* the result in matrix @result
*
* The multiplication algorithm is the O(n^3)
* algorithm
*/
void mm(matrix a, matrix b, matrix result) {
int i, j, k;
int size = a.size;
long long before, after;
before = wall_clock_time();
// Do the multiplication
transpose(size, b); // transpose the matrix to reduce cache miss
for (i = 0; i < size; i++)
for (j = 0; j < size; j++) {
int tmp = 0; // save memory writes
for(k = 0; k < size; k++)
tmp += a.element[i][k] * b.element[j][k];
result.element[i][j] = tmp;
}
//transpose(size, b);
after = wall_clock_time();
fprintf(stderr, "Matrix multiplication took %1.2f seconds\n", ((float)(after - before))/1000000000);
}
void print_matrix(matrix m)
{
int i, j;
int size = m.size;
for (i = 0; i < size; i++)
{
printf("row =%4d: ", i);
for (j = 0; j < size; j++)
printf("%6.2f ", m.element[i][j]);
printf("\n");
}
}
void work(int size)
{
matrix a, b, result;
// Allocate memory for matrices
allocate_matrix(&a, size);
allocate_matrix(&b, size);
allocate_matrix(&result, size);
// Initialize matrix elements
init_matrix(a);
init_matrix(b);
init_matrix_zero(result);
// Perform sequential matrix multiplication
mm(a, b, result);
// Print the result matrix
//print_matrix(result);
free_matrix(a);
free_matrix(b);
free_matrix(result);
}
int main(int argc, char ** argv)
{
int size;
srand(0);
if (argc >= 2)
size = atoi(argv[1]);
else
size = 1024;
fprintf(stderr,"Sequential matrix multiplication of size %d\n", size);
// Multiply the matrices
work(size);
return 0;
}
#define SIZE 1024
#include <iostream>
#include <string>
#include <cstdio>
#include <stdlib.h>
using namespace std;
long long wall_clock_time();
void zero_matrix(float m[SIZE][SIZE]);
void transpose_matrix(float m[SIZE][SIZE]);
void matrix_multiply(float m1[SIZE][SIZE], float m2[SIZE][SIZE], float result[SIZE][SIZE]);
void print_matrix(float m[SIZE][SIZE]);
void random_matrix(float m[SIZE][SIZE]);
void zero_matrix(float m[SIZE][SIZE]) {
for (int i = 0; i < SIZE; i++)
for (int j = 0; j < SIZE; j++)
m[i][j] = 0;
}
void transpose_matrix(float m[SIZE][SIZE]) {
for (int i = 0; i < SIZE; i++)
for (int j = i + 1; j < SIZE; j++)
swap(m[i][j], m[j][i]);
}
void matrix_multiply(float m1[SIZE][SIZE], float m2[SIZE][SIZE], float result[SIZE][SIZE]) {
long long start, end;
start = wall_clock_time();
transpose_matrix(m2);
for (int i = 0; i < SIZE; i++) {
for (int j = 0; j < SIZE; j++) {
float tmp = 0;
for (int k = 0; k < SIZE; k++)
tmp += m1[i][k] * m2[j][k];
result[i][j] = tmp;
}
}
end = wall_clock_time();
fprintf(stderr, "Matrix multiplication took %1.2f seconds\n", ((float)(end - start))/1000000000);
}
void print_matrix(float m[SIZE][SIZE]) {
for (int i = 0; i < SIZE; i++) {
for (int j = 0; j < SIZE; j++) {
cout << m[i][j] << " ";
}
cout << endl;
}
}
void random_matrix(float m[SIZE][SIZE]) {
for (int i = 0; i < SIZE; i++)
for (int j = 0; j < SIZE; j++)
m[i][j] = rand() % 10;
}
int main() {
// init matrixes
static float m1[SIZE][SIZE];
static float m2[SIZE][SIZE];
static float result[SIZE][SIZE];
random_matrix(m1);
random_matrix(m2);
zero_matrix(result);
// do matrix multiply
matrix_multiply(m1, m2, result);
// print result
print_matrix(result);
// cleanup
return 0;
}
/********************************************
* Helpers
*******************************************/
long long wall_clock_time() {
#ifdef __linux__
struct timespec tp;
clock_gettime(CLOCK_REALTIME, &tp);
return (long long)(tp.tv_nsec + (long long)tp.tv_sec * 1000000000ll);
#else
#warning "Your timer resoultion might be too low. Compile on Linux and link with librt"
struct timeval tv;
gettimeofday(&tv, NULL);
return (long long)(tv.tv_usec * 1000 + (long long)tv.tv_sec * 1000000000ll);
#endif
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment