tell/matrix_mul.c

## matrix_mul.c
/*
 * The author and original source code are from:
 *     機械学習序曲(仮) > 数値計算の実行速度 - Python, C, Rの比較
 *     http://hawaii.naist.jp/~kohei-h/my/python-numeric-comp.html
 *
 * It is modified version in above Web page.
 *
 * This program needs data files to construct matrix.
 * A generating data files program is published in
 * original author's Web page.
 */
/*
 * A comparison routine on floating point arithmetic, is added.
 * This source code is based on:
 *     FLP35-C. 浮動小数点数値を比較する際には精度を考慮する
 *     http://www.jpcert.or.jp/sc-rules/c-flp35-c.html
 */

#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/errno.h>
#include <time.h>
#include <string.h>

#include <xmmintrin.h>

#define ABS__(x) ((x) < 0 ? -(x) : (x))
#define MAX__(x,y) ((x) < (y) ? (y) : (x))
#define FUNC_ENTRY__(name) { name, #name }
#define ALIGN__ (16)

typedef float mat_elem_t;

const size_t ALIGNMENT = sizeof(__m128);  // ALIGN__;
const size_t SHIFT_TO_ALIGN = 0;

const unsigned int NUM = 10;

void matrix_free(mat_elem_t *(mat_ptr[]), const size_t dim)
{
  size_t i;
  for (i = 0; i < dim; i++) {
    free(mat_ptr[i]);
  }
}

void matrix_set_NULL(mat_elem_t *(mat_ptr[]), const size_t dim)
{
  size_t i;
  for (i = 0; i < dim; i++) {
    mat_ptr[i] = NULL;
  }
}

void setzero(mat_elem_t *(M[]), const size_t num)
{
  size_t i, j;
  for(i = 0; i < num; i++) {
    for(j = 0; j < num; j++) {
      M[i][j] = 0;
    }
  }
}

void vector_set_zero(mat_elem_t *vec, const size_t dim)
{
  size_t i;
  for (i = 0; i < dim; i++) {
    vec[i] = 0;
  }
}

size_t num_of_aligned_row_elem(const size_t dim)
{
  const size_t n = ALIGNMENT / sizeof(mat_elem_t);
  const size_t q = dim / n;
  const size_t r = dim % n;
  return r == 0 ? q * n : (q + 1) * n;
}

int matrix_allocate(mat_elem_t *((*mat_ptr)[]), const size_t dim)
{
  size_t i;
  for (i = 0; i < dim; i++) {
    const size_t row_size = num_of_aligned_row_elem(dim);
    mat_elem_t *row_ptr;
    int alloc_ret_code = 0;

    alloc_ret_code = posix_memalign((void **)(&row_ptr), ALIGNMENT, row_size * sizeof(mat_elem_t));
    switch (alloc_ret_code) {
    case EINVAL:
    case ENOMEM:
      return alloc_ret_code;
    case 0:
      break;
    default:
      return -1;
    }
    vector_set_zero(row_ptr, row_size);
    (*mat_ptr)[i] = row_ptr;
  }
  return 0;
}

// initialize matrix
void *init(mat_elem_t *(arr[]), const char *filename, int num){
  int i, j;
  char *pt;
  FILE *fp;
  char buf[num*10];

  if (filename == NULL) fp = NULL;
  else fp = fopen(filename, "r");

  for (i = 0; i < num; i++) {
    if (fp == NULL) continue;

    fgets(buf, sizeof(buf), fp);
    arr[i][0] = atof(strtok(buf, " "));
    for (j = 1; ( pt = strtok(NULL, " ") ) != NULL; j++) {
      arr[i][j] = atof(pt);
    }
  }
}

float float_diff_normalized(const float a, const float b)
{
  float c = ABS__(a);
  float d = ABS__(b);
  d = MAX__(c, d);
  return d == 0.0 ? 0.0 : ABS__(a - b) / d;
}

int is_eq(const mat_elem_t *(L[restrict]), const mat_elem_t *(R[restrict]), const size_t num)
{
  size_t i, j;
  float d;
  for (i = 0; i < num; i++) {
    for (j = 0; j < num; j++) {
      d = float_diff_normalized(L[i][j], R[i][j]);
      if (! (d <= __FLT_EPSILON__)) {
        printf("difference found at (%lu, %lu)\n", i, j);
        printf("diff is %f\n", d);
        return 0;
      }
    }
  }
  return 1;
}

void matprint(const mat_elem_t *(M[]), const size_t num)
{
  size_t i, j;
  for (i = 0; i < num; i++) {
    for (j = 0; j < num; j++) {
      printf("%f ", M[i][j]);
    }
    printf("\n");
  }
}

void matmul_orig(mat_elem_t *(M[]), const mat_elem_t *(A[]), const mat_elem_t *(B[]), const size_t num)
{
  size_t i, j, k;
  for (i = 0; i < num; i++){
    for (j = 0; j < num; j++){
      for (k = 0; k < num; k++){
        M[i][j] += A[i][k] * B[k][j];
      }
    }
  }
}

void matmul_0(mat_elem_t *(M[]), const mat_elem_t *(A[]), const mat_elem_t *(B[]), const size_t num)
{
  mat_elem_t t;
  size_t i, j, k;
  for (j = 0; j < num; j++){
    for (i = 0; i < num; i++){
      t = A[i][j];
      for (k = 0; k < num; k++){
        M[i][k] += t * B[j][k];
      }
    }
  }
}

void matmul_1(mat_elem_t *(M[restrict]), const mat_elem_t *(A[restrict]), const mat_elem_t *(B[restrict]), const size_t num)
{
  mat_elem_t t;
  const mat_elem_t *B_row;
  mat_elem_t *M_row;
  size_t i, j, k;
  for (j = 0; j < num; j++){
    B_row = B[j];
    for (i = 0; i < num; i++){
      t = A[i][j];
      M_row = M[i];
      for (k = 0; k < num; k++){
        M_row[k] += t * B_row[k];
      }
    }
  }
}

void matmul_2(mat_elem_t *(M[restrict]), const mat_elem_t *(A[restrict]), const mat_elem_t *(B[restrict]), const size_t dim)
{
  __m128 tvec, tm;
  const __m128 *B_row;
  __m128 *M_row;
  size_t i, j, k;
  const size_t aligned_dim = (dim + 3) >> 2;

  for (j = 0; j < dim; j++){
    B_row = (const __m128 *)B[j];
    for (i = 0; i < dim; i++){
      tvec = _mm_set1_ps(A[i][j]);
      M_row = (__m128 *)M[i];
      for (k = 0; k < aligned_dim; k++){
        tm = _mm_mul_ps(tvec, B_row[k]);
        tm = _mm_add_ps(M_row[k], tm);
        _mm_stream_ps((float *)(M_row + k), tm);
      }
    }
  }
}

/**/

typedef void (*matmat_func_t)(mat_elem_t *([]), const mat_elem_t *([]), const mat_elem_t *([]), const size_t);

void initialize_test_data(mat_elem_t *((*A)[]), mat_elem_t *((*B)[]), mat_elem_t *((*M0)[]), mat_elem_t *((*M1)[]), const size_t dim)
{
  int alloc_ret_code;

  alloc_ret_code = matrix_allocate(A, dim);
  if (alloc_ret_code != 0) {
    fprintf(stderr, "%s:%d: error: return code = %d\n",
            __FILE__, __LINE__, alloc_ret_code);
  }
  alloc_ret_code = matrix_allocate(B, dim);
  if (alloc_ret_code != 0) {
    fprintf(stderr, "%s:%d: error: return code = %d\n",
            __FILE__, __LINE__, alloc_ret_code);
  }
  alloc_ret_code = matrix_allocate(M0, dim);
  if (alloc_ret_code != 0) {
    fprintf(stderr, "%s:%d: error: return code = %d\n",
            __FILE__, __LINE__, alloc_ret_code);
  }
  alloc_ret_code = matrix_allocate(M1, dim);
  if (alloc_ret_code != 0) {
    fprintf(stderr, "%s:%d: error: return code = %d\n",
            __FILE__, __LINE__, alloc_ret_code);
  }

  init(*A, "A.dat", dim);
  init(*B, "B.dat", dim);

  {
    size_t i;
    for (i = 0; i < dim; i++) {
      assert( (((uintptr_t)((*A)[i])) % ALIGNMENT ) == 0 );
      assert( (((uintptr_t)((*B)[i])) % ALIGNMENT ) == 0 );
      assert( (((uintptr_t)((*M0)[i])) % ALIGNMENT ) == 0 );
      assert( (((uintptr_t)((*M1)[i])) % ALIGNMENT ) == 0 );
    }
  }
}

void benchmark()
{
  time_t start, end;
  double diff;
  mat_elem_t *(A[NUM]), *(B[NUM]);
  mat_elem_t *(M[NUM]), *(M_orig[NUM]);
  struct {
    matmat_func_t f_;
    const char *name_;
  } func_tbl[] = {
    FUNC_ENTRY__(matmul_orig),
    FUNC_ENTRY__(matmul_0),
    FUNC_ENTRY__(matmul_1),
    FUNC_ENTRY__(matmul_2),
  };
  size_t i, flen;

  //

  initialize_test_data(&A, &B, &M_orig, &M, NUM);

  setzero(M_orig, NUM);
  func_tbl[0].f_(M_orig, A, B, NUM);

  flen = sizeof(func_tbl)/sizeof(*func_tbl);
  for (i = 0; i < flen; i++) {
    setzero(M, NUM);

    time (&start);
    func_tbl[i].f_(M, A, B, NUM);
    time (&end);

    diff = difftime(end,start);
    printf("%s : %.2f\n", func_tbl[i].name_, diff);
    if (! is_eq(M, M_orig, NUM)) {
      printf("different\n");
    }
  }
}

void check()
{
#ifdef NDEBUG
  puts("NDEBUG macro is enabled");
#else
  puts("NDEBUG macro is disabled");
#endif
}

int main()
{
  check();

  printf("dimension = %u\n", NUM);

  benchmark();
  return 0;
}
	/*
	* The author and original source code are from:
	* 機械学習序曲(仮) > 数値計算の実行速度 - Python, C, Rの比較
	* http://hawaii.naist.jp/~kohei-h/my/python-numeric-comp.html
	*
	* It is modified version in above Web page.
	*
	* This program needs data files to construct matrix.
	* A generating data files program is published in
	* original author's Web page.
	*/
	/*
	* A comparison routine on floating point arithmetic, is added.
	* This source code is based on:
	* FLP35-C. 浮動小数点数値を比較する際には精度を考慮する
	* http://www.jpcert.or.jp/sc-rules/c-flp35-c.html
	*/

	#include <assert.h>
	#include <limits.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/errno.h>
	#include <time.h>
	#include <string.h>

	#include <xmmintrin.h>

	#define ABS__(x) ((x) < 0 ? -(x) : (x))
	#define MAX__(x,y) ((x) < (y) ? (y) : (x))
	#define FUNC_ENTRY__(name) { name, #name }
	#define ALIGN__ (16)

	typedef float mat_elem_t;

	const size_t ALIGNMENT = sizeof(__m128); // ALIGN__;
	const size_t SHIFT_TO_ALIGN = 0;

	const unsigned int NUM = 10;

	void matrix_free(mat_elem_t *(mat_ptr[]), const size_t dim)
	{
	size_t i;
	for (i = 0; i < dim; i++) {
	free(mat_ptr[i]);
	}
	}

	void matrix_set_NULL(mat_elem_t *(mat_ptr[]), const size_t dim)
	{
	size_t i;
	for (i = 0; i < dim; i++) {
	mat_ptr[i] = NULL;
	}
	}

	void setzero(mat_elem_t *(M[]), const size_t num)
	{
	size_t i, j;
	for(i = 0; i < num; i++) {
	for(j = 0; j < num; j++) {
	M[i][j] = 0;
	}
	}
	}

	void vector_set_zero(mat_elem_t *vec, const size_t dim)
	{
	size_t i;
	for (i = 0; i < dim; i++) {
	vec[i] = 0;
	}
	}

	size_t num_of_aligned_row_elem(const size_t dim)
	{
	const size_t n = ALIGNMENT / sizeof(mat_elem_t);
	const size_t q = dim / n;
	const size_t r = dim % n;
	return r == 0 ? q * n : (q + 1) * n;
	}

	int matrix_allocate(mat_elem_t ((mat_ptr)[]), const size_t dim)
	{
	size_t i;
	for (i = 0; i < dim; i++) {
	const size_t row_size = num_of_aligned_row_elem(dim);
	mat_elem_t *row_ptr;
	int alloc_ret_code = 0;

	alloc_ret_code = posix_memalign((void *)(&row_ptr), ALIGNMENT, row_size sizeof(mat_elem_t));
	switch (alloc_ret_code) {
	case EINVAL:
	case ENOMEM:
	return alloc_ret_code;
	case 0:
	break;
	default:
	return -1;
	}
	vector_set_zero(row_ptr, row_size);
	(*mat_ptr)[i] = row_ptr;
	}
	return 0;
	}

	// initialize matrix
	void init(mat_elem_t (arr[]), const char *filename, int num){
	int i, j;
	char *pt;
	FILE *fp;
	char buf[num*10];

	if (filename == NULL) fp = NULL;
	else fp = fopen(filename, "r");

	for (i = 0; i < num; i++) {
	if (fp == NULL) continue;

	fgets(buf, sizeof(buf), fp);
	arr[i][0] = atof(strtok(buf, " "));
	for (j = 1; ( pt = strtok(NULL, " ") ) != NULL; j++) {
	arr[i][j] = atof(pt);
	}
	}
	}

	float float_diff_normalized(const float a, const float b)
	{
	float c = ABS__(a);
	float d = ABS__(b);
	d = MAX__(c, d);
	return d == 0.0 ? 0.0 : ABS__(a - b) / d;
	}

	int is_eq(const mat_elem_t (L[restrict]), const mat_elem_t (R[restrict]), const size_t num)
	{
	size_t i, j;
	float d;
	for (i = 0; i < num; i++) {
	for (j = 0; j < num; j++) {
	d = float_diff_normalized(L[i][j], R[i][j]);
	if (! (d <= __FLT_EPSILON__)) {
	printf("difference found at (%lu, %lu)\n", i, j);
	printf("diff is %f\n", d);
	return 0;
	}
	}
	}
	return 1;
	}

	void matprint(const mat_elem_t *(M[]), const size_t num)
	{
	size_t i, j;
	for (i = 0; i < num; i++) {
	for (j = 0; j < num; j++) {
	printf("%f ", M[i][j]);
	}
	printf("\n");
	}
	}

	void matmul_orig(mat_elem_t (M[]), const mat_elem_t (A[]), const mat_elem_t *(B[]), const size_t num)
	{
	size_t i, j, k;
	for (i = 0; i < num; i++){
	for (j = 0; j < num; j++){
	for (k = 0; k < num; k++){
	M[i][j] += A[i][k] * B[k][j];
	}
	}
	}
	}

	void matmul_0(mat_elem_t (M[]), const mat_elem_t (A[]), const mat_elem_t *(B[]), const size_t num)
	{
	mat_elem_t t;
	size_t i, j, k;
	for (j = 0; j < num; j++){
	for (i = 0; i < num; i++){
	t = A[i][j];
	for (k = 0; k < num; k++){
	M[i][k] += t * B[j][k];
	}
	}
	}
	}

	void matmul_1(mat_elem_t (M[restrict]), const mat_elem_t (A[restrict]), const mat_elem_t *(B[restrict]), const size_t num)
	{
	mat_elem_t t;
	const mat_elem_t *B_row;
	mat_elem_t *M_row;
	size_t i, j, k;
	for (j = 0; j < num; j++){
	B_row = B[j];
	for (i = 0; i < num; i++){
	t = A[i][j];
	M_row = M[i];
	for (k = 0; k < num; k++){
	M_row[k] += t * B_row[k];
	}
	}
	}
	}

	void matmul_2(mat_elem_t (M[restrict]), const mat_elem_t (A[restrict]), const mat_elem_t *(B[restrict]), const size_t dim)
	{
	__m128 tvec, tm;
	const __m128 *B_row;
	__m128 *M_row;
	size_t i, j, k;
	const size_t aligned_dim = (dim + 3) >> 2;

	for (j = 0; j < dim; j++){
	B_row = (const __m128 *)B[j];
	for (i = 0; i < dim; i++){
	tvec = _mm_set1_ps(A[i][j]);
	M_row = (__m128 *)M[i];
	for (k = 0; k < aligned_dim; k++){
	tm = _mm_mul_ps(tvec, B_row[k]);
	tm = _mm_add_ps(M_row[k], tm);
	_mm_stream_ps((float *)(M_row + k), tm);
	}
	}
	}
	}

	/**/

	typedef void (matmat_func_t)(mat_elem_t ([]), const mat_elem_t ([]), const mat_elem_t ([]), const size_t);

	void initialize_test_data(mat_elem_t ((A)[]), mat_elem_t ((B)[]), mat_elem_t ((M0)[]), mat_elem_t ((M1)[]), const size_t dim)
	{
	int alloc_ret_code;

	alloc_ret_code = matrix_allocate(A, dim);
	if (alloc_ret_code != 0) {
	fprintf(stderr, "%s:%d: error: return code = %d\n",
	__FILE__, __LINE__, alloc_ret_code);
	}
	alloc_ret_code = matrix_allocate(B, dim);
	if (alloc_ret_code != 0) {
	fprintf(stderr, "%s:%d: error: return code = %d\n",
	__FILE__, __LINE__, alloc_ret_code);
	}
	alloc_ret_code = matrix_allocate(M0, dim);
	if (alloc_ret_code != 0) {
	fprintf(stderr, "%s:%d: error: return code = %d\n",
	__FILE__, __LINE__, alloc_ret_code);
	}
	alloc_ret_code = matrix_allocate(M1, dim);
	if (alloc_ret_code != 0) {
	fprintf(stderr, "%s:%d: error: return code = %d\n",
	__FILE__, __LINE__, alloc_ret_code);
	}

	init(*A, "A.dat", dim);
	init(*B, "B.dat", dim);

	{
	size_t i;
	for (i = 0; i < dim; i++) {
	assert( (((uintptr_t)((*A)[i])) % ALIGNMENT ) == 0 );
	assert( (((uintptr_t)((*B)[i])) % ALIGNMENT ) == 0 );
	assert( (((uintptr_t)((*M0)[i])) % ALIGNMENT ) == 0 );
	assert( (((uintptr_t)((*M1)[i])) % ALIGNMENT ) == 0 );
	}
	}
	}

	void benchmark()
	{
	time_t start, end;
	double diff;
	mat_elem_t (A[NUM]), (B[NUM]);
	mat_elem_t (M[NUM]), (M_orig[NUM]);
	struct {
	matmat_func_t f_;
	const char *name_;
	} func_tbl[] = {
	FUNC_ENTRY__(matmul_orig),
	FUNC_ENTRY__(matmul_0),
	FUNC_ENTRY__(matmul_1),
	FUNC_ENTRY__(matmul_2),
	};
	size_t i, flen;

	//

	initialize_test_data(&A, &B, &M_orig, &M, NUM);

	setzero(M_orig, NUM);
	func_tbl[0].f_(M_orig, A, B, NUM);

	flen = sizeof(func_tbl)/sizeof(*func_tbl);
	for (i = 0; i < flen; i++) {
	setzero(M, NUM);

	time (&start);
	func_tbl[i].f_(M, A, B, NUM);
	time (&end);

	diff = difftime(end,start);
	printf("%s : %.2f\n", func_tbl[i].name_, diff);
	if (! is_eq(M, M_orig, NUM)) {
	printf("different\n");
	}
	}
	}

	void check()
	{
	#ifdef NDEBUG
	puts("NDEBUG macro is enabled");
	#else
	puts("NDEBUG macro is disabled");
	#endif
	}

	int main()
	{
	check();

	printf("dimension = %u\n", NUM);

	benchmark();
	return 0;
	}