jiewmeng/matrix.cpp

## matrix.cpp
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <assert.h>
#include <algorithm>


typedef struct
{
	int size;
	float ** element;
} matrix;

/**
 * Returns the number of nanoseconds since the beginning of the program
 * Requires linking with librt (-lrt flag on GCC)
 **/
long long wall_clock_time()
{
#ifdef __linux__
	struct timespec tp;
	clock_gettime(CLOCK_REALTIME, &tp);
	return (long long)(tp.tv_nsec + (long long)tp.tv_sec * 1000000000ll);
#else
	#warning "Your timer resoultion might be too low. Compile on Linux and link with librt"
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return (long long)(tv.tv_usec * 1000 + (long long)tv.tv_sec * 1000000000ll);
#endif
}

/**
 * Allocates memory for a matrix of size SIZE
 * The memory is allocated row-major order, i.e.
 *  elements from the same row are allocated at contiguous
 *  memory addresses.
 **/
void allocate_matrix(matrix* m, int size) {
	int i, j;
	m->size = size;

	// allocate array for all the rows
	m->element = (float**)malloc(sizeof(float*) * size);
	if (m->element == NULL) {
		fprintf(stderr, "Out of memory\n");
		exit(1);
	}

	// allocate an array for each row of the matrix
	for	(i = 0; i < size; i++) {
		m->element[i] = (float*)malloc(sizeof(float) * size);
		if (m->element[i] == NULL)
		{
			fprintf(stderr, "Out of memory\n");
			exit(1);
		}
	}
}

/**
 * Initializes the elements of the matrix with
 * random values between 0 and 9
 **/
void init_matrix(matrix m)
{
	int i, j;
	int size = m.size;


	for (i = 0; i < size; i++)
		for (j = 0; j < size; j++)
		{
			m.element[i][j] = rand() % 10;
		}
}

/**
 * Initializes the elements of the matrix with
 * element 0.
 **/
void init_matrix_zero(matrix m)
{
	int i, j;
	int size = m.size;

	for (i = 0; i < size; i++)
		for (j = 0; j < size; j++)
		{
			m.element[i][j] = 0.0;
		}
}


/**
 * Frees the memory of matrix @m
 **/
void free_matrix(matrix m)
{
	int i;
	int size = m.size;

	for (i = 0; i < size; i++)
		free(m.element[i]);

	free(m.element);
}

void transpose(int size, matrix m) {
	int i, j;
	for (i = 0; i < size; i++)
		for (j = 0; j < size; j++)
			std::swap(m.element[i][j], m.element[j][i]);
}

/**
 * Multiplies matrix @a with matrix @b storing
 * the result in matrix @result
 *
 * The multiplication algorithm is the O(n^3)
 * algorithm
 */
void mm(matrix a, matrix b, matrix result) {
	int i, j, k;
	int size = a.size;
	long long before, after;

	before = wall_clock_time();
	// Do the multiplication
	transpose(size, b); // transpose the matrix to reduce cache miss
	for (i = 0; i < size; i++)
		for (j = 0; j < size; j++) {
			int tmp = 0; // save memory writes
			for(k = 0; k < size; k++)
				tmp += a.element[i][k] * b.element[j][k];
			result.element[i][j] = tmp;
		}
    //transpose(size, b);
	after = wall_clock_time();
	fprintf(stderr, "Matrix multiplication took %1.2f seconds\n", ((float)(after - before))/1000000000);
}


void print_matrix(matrix m)
{
	int i, j;
	int size = m.size;

	for (i = 0; i < size; i++)
	{
		printf("row =%4d: ", i);
		for (j = 0; j < size; j++)
			printf("%6.2f  ", m.element[i][j]);
		printf("\n");
	}
}


void work(int size)
{
	matrix a, b, result;

	// Allocate memory for matrices
	allocate_matrix(&a, size);
	allocate_matrix(&b, size);
	allocate_matrix(&result, size);

	// Initialize matrix elements
	init_matrix(a);
	init_matrix(b);
	init_matrix_zero(result);

	// Perform sequential matrix multiplication
	mm(a, b, result);

	// Print the result matrix
	//print_matrix(result);

	free_matrix(a);
	free_matrix(b);
	free_matrix(result);

}


int main(int argc, char ** argv)
{
	int size;
	srand(0);

	if (argc >= 2)
		size = atoi(argv[1]);
	else
		size = 1024;

	fprintf(stderr,"Sequential matrix multiplication of size %d\n", size);

	// Multiply the matrices
	work(size);

	return 0;
}

## matrix2.cpp
#define SIZE 1024

#include <iostream>
#include <string>
#include <cstdio>
#include <stdlib.h>
using namespace std;

long long wall_clock_time();
void zero_matrix(float m[SIZE][SIZE]);
void transpose_matrix(float m[SIZE][SIZE]);
void matrix_multiply(float m1[SIZE][SIZE], float m2[SIZE][SIZE], float result[SIZE][SIZE]);
void print_matrix(float m[SIZE][SIZE]);
void random_matrix(float m[SIZE][SIZE]);

void zero_matrix(float m[SIZE][SIZE]) {
	for (int i = 0; i < SIZE; i++)
		for (int j = 0; j < SIZE; j++)
			m[i][j] = 0;
}

void transpose_matrix(float m[SIZE][SIZE]) {
	for (int i = 0; i < SIZE; i++)
		for (int j = i + 1; j < SIZE; j++)
			swap(m[i][j], m[j][i]);
}

void matrix_multiply(float m1[SIZE][SIZE], float m2[SIZE][SIZE], float result[SIZE][SIZE]) {
	long long start, end;

	start = wall_clock_time();
	transpose_matrix(m2);
	for (int i = 0; i < SIZE; i++) {
		for (int j = 0; j < SIZE; j++) {
			float tmp = 0;
			for (int k = 0; k < SIZE; k++)
				tmp += m1[i][k] * m2[j][k];
			result[i][j] = tmp;
		}
	}
	end = wall_clock_time();
	fprintf(stderr, "Matrix multiplication took %1.2f seconds\n", ((float)(end - start))/1000000000);
}

void print_matrix(float m[SIZE][SIZE]) {
	for (int i = 0; i < SIZE; i++) {
		for (int j = 0; j < SIZE; j++) {
			cout << m[i][j] << " ";
		}
		cout << endl;
	}
}

void random_matrix(float m[SIZE][SIZE]) {
	for (int i = 0; i < SIZE; i++)
		for (int j = 0; j < SIZE; j++)
			m[i][j] = rand() % 10;
}

int main() {
	// init matrixes
	static float m1[SIZE][SIZE];
	static float m2[SIZE][SIZE];
	static float result[SIZE][SIZE];
	random_matrix(m1);
	random_matrix(m2);
	zero_matrix(result);

	// do matrix multiply
	matrix_multiply(m1, m2, result);

	// print result
	print_matrix(result);

	// cleanup


	return 0;
}


/********************************************
 * Helpers
 *******************************************/
long long wall_clock_time() {
#ifdef __linux__
	struct timespec tp;
	clock_gettime(CLOCK_REALTIME, &tp);
	return (long long)(tp.tv_nsec + (long long)tp.tv_sec * 1000000000ll);
#else
	#warning "Your timer resoultion might be too low. Compile on Linux and link with librt"
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return (long long)(tv.tv_usec * 1000 + (long long)tv.tv_sec * 1000000000ll);
#endif
}
	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>
	#include <sys/time.h>
	#include <assert.h>
	#include <algorithm>


	typedef struct
	{
	int size;
	float ** element;
	} matrix;

	/**
	* Returns the number of nanoseconds since the beginning of the program
	* Requires linking with librt (-lrt flag on GCC)
	**/
	long long wall_clock_time()
	{
	#ifdef __linux__
	struct timespec tp;
	clock_gettime(CLOCK_REALTIME, &tp);
	return (long long)(tp.tv_nsec + (long long)tp.tv_sec * 1000000000ll);
	#else
	#warning "Your timer resoultion might be too low. Compile on Linux and link with librt"
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return (long long)(tv.tv_usec * 1000 + (long long)tv.tv_sec * 1000000000ll);
	#endif
	}

	/**
	* Allocates memory for a matrix of size SIZE
	* The memory is allocated row-major order, i.e.
	* elements from the same row are allocated at contiguous
	* memory addresses.
	**/
	void allocate_matrix(matrix* m, int size) {
	int i, j;
	m->size = size;

	// allocate array for all the rows
	m->element = (float*)malloc(sizeof(float) * size);
	if (m->element == NULL) {
	fprintf(stderr, "Out of memory\n");
	exit(1);
	}

	// allocate an array for each row of the matrix
	for (i = 0; i < size; i++) {
	m->element[i] = (float)malloc(sizeof(float) size);
	if (m->element[i] == NULL)
	{
	fprintf(stderr, "Out of memory\n");
	exit(1);
	}
	}
	}

	/**
	* Initializes the elements of the matrix with
	* random values between 0 and 9
	**/
	void init_matrix(matrix m)
	{
	int i, j;
	int size = m.size;


	for (i = 0; i < size; i++)
	for (j = 0; j < size; j++)
	{
	m.element[i][j] = rand() % 10;
	}
	}

	/**
	* Initializes the elements of the matrix with
	* element 0.
	**/
	void init_matrix_zero(matrix m)
	{
	int i, j;
	int size = m.size;

	for (i = 0; i < size; i++)
	for (j = 0; j < size; j++)
	{
	m.element[i][j] = 0.0;
	}
	}


	/**
	* Frees the memory of matrix @m
	**/
	void free_matrix(matrix m)
	{
	int i;
	int size = m.size;

	for (i = 0; i < size; i++)
	free(m.element[i]);

	free(m.element);
	}

	void transpose(int size, matrix m) {
	int i, j;
	for (i = 0; i < size; i++)
	for (j = 0; j < size; j++)
	std::swap(m.element[i][j], m.element[j][i]);
	}

	/**
	* Multiplies matrix @a with matrix @b storing
	* the result in matrix @result
	*
	* The multiplication algorithm is the O(n^3)
	* algorithm
	*/
	void mm(matrix a, matrix b, matrix result) {
	int i, j, k;
	int size = a.size;
	long long before, after;

	before = wall_clock_time();
	// Do the multiplication
	transpose(size, b); // transpose the matrix to reduce cache miss
	for (i = 0; i < size; i++)
	for (j = 0; j < size; j++) {
	int tmp = 0; // save memory writes
	for(k = 0; k < size; k++)
	tmp += a.element[i][k] * b.element[j][k];
	result.element[i][j] = tmp;
	}
	//transpose(size, b);
	after = wall_clock_time();
	fprintf(stderr, "Matrix multiplication took %1.2f seconds\n", ((float)(after - before))/1000000000);
	}


	void print_matrix(matrix m)
	{
	int i, j;
	int size = m.size;

	for (i = 0; i < size; i++)
	{
	printf("row =%4d: ", i);
	for (j = 0; j < size; j++)
	printf("%6.2f ", m.element[i][j]);
	printf("\n");
	}
	}



	void work(int size)
	{
	matrix a, b, result;

	// Allocate memory for matrices
	allocate_matrix(&a, size);
	allocate_matrix(&b, size);
	allocate_matrix(&result, size);

	// Initialize matrix elements
	init_matrix(a);
	init_matrix(b);
	init_matrix_zero(result);

	// Perform sequential matrix multiplication
	mm(a, b, result);

	// Print the result matrix
	//print_matrix(result);

	free_matrix(a);
	free_matrix(b);
	free_matrix(result);

	}


	int main(int argc, char ** argv)
	{
	int size;
	srand(0);

	if (argc >= 2)
	size = atoi(argv[1]);
	else
	size = 1024;

	fprintf(stderr,"Sequential matrix multiplication of size %d\n", size);

	// Multiply the matrices
	work(size);

	return 0;
	}
	#define SIZE 1024

	#include <iostream>
	#include <string>
	#include <cstdio>
	#include <stdlib.h>
	using namespace std;

	long long wall_clock_time();
	void zero_matrix(float m[SIZE][SIZE]);
	void transpose_matrix(float m[SIZE][SIZE]);
	void matrix_multiply(float m1[SIZE][SIZE], float m2[SIZE][SIZE], float result[SIZE][SIZE]);
	void print_matrix(float m[SIZE][SIZE]);
	void random_matrix(float m[SIZE][SIZE]);

	void zero_matrix(float m[SIZE][SIZE]) {
	for (int i = 0; i < SIZE; i++)
	for (int j = 0; j < SIZE; j++)
	m[i][j] = 0;
	}

	void transpose_matrix(float m[SIZE][SIZE]) {
	for (int i = 0; i < SIZE; i++)
	for (int j = i + 1; j < SIZE; j++)
	swap(m[i][j], m[j][i]);
	}

	void matrix_multiply(float m1[SIZE][SIZE], float m2[SIZE][SIZE], float result[SIZE][SIZE]) {
	long long start, end;

	start = wall_clock_time();
	transpose_matrix(m2);
	for (int i = 0; i < SIZE; i++) {
	for (int j = 0; j < SIZE; j++) {
	float tmp = 0;
	for (int k = 0; k < SIZE; k++)
	tmp += m1[i][k] * m2[j][k];
	result[i][j] = tmp;
	}
	}
	end = wall_clock_time();
	fprintf(stderr, "Matrix multiplication took %1.2f seconds\n", ((float)(end - start))/1000000000);
	}

	void print_matrix(float m[SIZE][SIZE]) {
	for (int i = 0; i < SIZE; i++) {
	for (int j = 0; j < SIZE; j++) {
	cout << m[i][j] << " ";
	}
	cout << endl;
	}
	}

	void random_matrix(float m[SIZE][SIZE]) {
	for (int i = 0; i < SIZE; i++)
	for (int j = 0; j < SIZE; j++)
	m[i][j] = rand() % 10;
	}

	int main() {
	// init matrixes
	static float m1[SIZE][SIZE];
	static float m2[SIZE][SIZE];
	static float result[SIZE][SIZE];
	random_matrix(m1);
	random_matrix(m2);
	zero_matrix(result);

	// do matrix multiply
	matrix_multiply(m1, m2, result);

	// print result
	print_matrix(result);

	// cleanup


	return 0;
	}




	/********************************************
	* Helpers
	*******************************************/
	long long wall_clock_time() {
	#ifdef __linux__
	struct timespec tp;
	clock_gettime(CLOCK_REALTIME, &tp);
	return (long long)(tp.tv_nsec + (long long)tp.tv_sec * 1000000000ll);
	#else
	#warning "Your timer resoultion might be too low. Compile on Linux and link with librt"
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return (long long)(tv.tv_usec * 1000 + (long long)tv.tv_sec * 1000000000ll);
	#endif
	}