momotaro98/gpu_kmeans.cu

## gpu_kmeans.cu
// -*- C++ -*-
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include "kmeans.h"

#define BLOCKSIZE (64) // 汎化のためにもdefineを定義するのはBLOCKSIZEだけにしよう

#ifdef __cplusplus
extern "C"
#endif

__global__ void nearest_point(float* inx, float* iny, int* inlabels, float* inavgx, float* inavgy, int* ineflags, int num_cluster)
{
  int tmp;
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  int k = threadIdx.x;
  //check nearest start
  int j;
  float nearest_distance = DBL_MAX;
  float tmp_f;
  int ret = 0;
  __shared__ float Xs[BLOCKSIZE];
  __shared__ float Ys[BLOCKSIZE];
  Xs[k] = inx[i];
  Ys[k] = iny[i];
  for(j = 0; j < num_cluster; j++){
  tmp_f = (Xs[k] - inavgx[j]) * (Xs[k] - inavgx[j]) + (Ys[k] - inavgy[j]) * (Ys[k] - inavgy[j]);
    if (tmp_f < nearest_distance) {
      nearest_distance = tmp_f;
      ret = j;
    }
  }
  tmp = ret;
  //check nearest end
  if(tmp != inlabels[i]){ ineflags[0] = 0; }
  inlabels[i] = tmp;
}

extern "C"
void gpu_kmeans(kmeans_t *problem)
{
  float *x = problem->x;
  float *y = problem->y;
  int *labels = problem->labels;
  int num_points = problem->num_points;
  int num_cluster = problem->num_clusters;
  int i;
  int loop_count = 0;
  float *avgx, *avgy;
  float *sumx, *sumy;
  int *cnt;
  int *eflags;

  //CPUMemoryAllocation
  avgx = (float *)malloc(num_cluster * sizeof(float));
  avgy = (float *)malloc(num_cluster * sizeof(float));
  sumx = (float *)malloc(num_cluster * sizeof(float));
  sumy = (float *)malloc(num_cluster * sizeof(float));
  cnt = (int *)malloc(num_cluster * sizeof(int));
  eflags = (int *)malloc(sizeof(int) * num_points);

  //Cuda Variable
  float* dx;
  float* dy;
  int* dlabels;
  float* davgx;
  float* davgy;
  int* deflags;

  //CudaMemoryAllocation
  cudaMalloc((void **)(&dx), sizeof(float) * num_points);
  cudaMalloc((void **)(&dy), sizeof(float) * num_points);
  cudaMalloc((void **)(&dlabels), sizeof(int) * num_points);
  cudaMalloc((void **)(&davgx), sizeof(float) * num_cluster);
  cudaMalloc((void **)(&davgy), sizeof(float) * num_cluster);
  cudaMalloc((void **)(&deflags), sizeof(int) * num_points);

  //CudaMemoryCopyHostToDevice
  cudaMemcpy(dx, x, sizeof(float)*num_points, cudaMemcpyHostToDevice);
  cudaMemcpy(dy, y, sizeof(float)*num_points, cudaMemcpyHostToDevice);
  cudaMemcpy(dlabels, labels, sizeof(int)*num_points, cudaMemcpyHostToDevice);

  dim3 blockNum(num_points/BLOCKSIZE, 1, 1);
  dim3 threadNum(BLOCKSIZE, 1, 1);

  for (i = 0; i < num_points; i++) {
    eflags[i] = 0;
  }

  // [1]: average point of each cluster start
  do {
    eflags[0] = 1;
    for (i = 0; i < num_cluster; i++) {
      sumx[i] = 0.0;
      sumy[i] = 0.0;
      cnt[i] = 0;
    }
    for (i = 0; i < num_points; i++) {
      sumx[labels[i]] += x[i];
      sumy[labels[i]] += y[i];
      cnt[labels[i]]++;
    }
    for (i = 0; i < num_cluster; i++) {
      if (cnt[i] != 0) {
        avgx[i] = sumx[i] / cnt[i];
        avgy[i] = sumy[i] / cnt[i];
      }
    }
    //CudaMemoryCopyHostToDevice
    cudaMemcpy(davgx, avgx, sizeof(float)*num_cluster, cudaMemcpyHostToDevice);
    cudaMemcpy(davgy, avgy, sizeof(float)*num_cluster, cudaMemcpyHostToDevice);
    cudaMemcpy(deflags, eflags, sizeof(int)*num_points, cudaMemcpyHostToDevice);
    // [1]: average point of each cluster end

    // [2]: the nearest average point
    // Kernel Program
    nearest_point<<<blockNum, threadNum>>>(dx, dy, dlabels, davgx, davgy, deflags, num_cluster);
    //Kernel Error Output Start
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) { // cudaSuccess, cudaFail が使える
      printf("-------CudaError----------\n");
      printf("%s\n", cudaGetErrorString(err));// エラーIDからエラーメッセージ取得
      printf("--------------------------\n");
        }
    //Kernel Error Output End

    cudaMemcpy(labels, dlabels, sizeof(int)*num_points, cudaMemcpyDeviceToHost);
    cudaMemcpy(eflags, deflags, sizeof(int)*num_points, cudaMemcpyDeviceToHost);

    loop_count++;
  }while(eflags[0] == 0);
  // [3]: [1], [2] continue until no change in [2]
  free(sumx);
  free(sumy);
  free(cnt);
  free(avgx);
  free(avgy);
  free(eflags);
  cudaFree(dx);
  cudaFree(dy);
  cudaFree(dlabels);
  cudaFree(davgx);
  cudaFree(davgy);
  cudaFree(deflags);
}
	// -- C++ --
	#include <stdio.h>
	#include <stdlib.h>
	#include <math.h>
	#include <float.h>
	#include "kmeans.h"

	#define BLOCKSIZE (64) // 汎化のためにもdefineを定義するのはBLOCKSIZEだけにしよう

	#ifdef __cplusplus
	extern "C"
	#endif

	__global__ void nearest_point(float* inx, float* iny, int* inlabels, float* inavgx, float* inavgy, int* ineflags, int num_cluster)
	{
	int tmp;
	int i = blockDim.x * blockIdx.x + threadIdx.x;
	int k = threadIdx.x;
	//check nearest start
	int j;
	float nearest_distance = DBL_MAX;
	float tmp_f;
	int ret = 0;
	__shared__ float Xs[BLOCKSIZE];
	__shared__ float Ys[BLOCKSIZE];
	Xs[k] = inx[i];
	Ys[k] = iny[i];
	for(j = 0; j < num_cluster; j++){
	tmp_f = (Xs[k] - inavgx[j]) * (Xs[k] - inavgx[j]) + (Ys[k] - inavgy[j]) * (Ys[k] - inavgy[j]);
	if (tmp_f < nearest_distance) {
	nearest_distance = tmp_f;
	ret = j;
	}
	}
	tmp = ret;
	//check nearest end
	if(tmp != inlabels[i]){ ineflags[0] = 0; }
	inlabels[i] = tmp;
	}

	extern "C"
	void gpu_kmeans(kmeans_t *problem)
	{
	float *x = problem->x;
	float *y = problem->y;
	int *labels = problem->labels;
	int num_points = problem->num_points;
	int num_cluster = problem->num_clusters;
	int i;
	int loop_count = 0;
	float avgx, avgy;
	float sumx, sumy;
	int *cnt;
	int *eflags;

	//CPUMemoryAllocation
	avgx = (float )malloc(num_cluster sizeof(float));
	avgy = (float )malloc(num_cluster sizeof(float));
	sumx = (float )malloc(num_cluster sizeof(float));
	sumy = (float )malloc(num_cluster sizeof(float));
	cnt = (int )malloc(num_cluster sizeof(int));
	eflags = (int )malloc(sizeof(int) num_points);

	//Cuda Variable
	float* dx;
	float* dy;
	int* dlabels;
	float* davgx;
	float* davgy;
	int* deflags;

	//CudaMemoryAllocation
	cudaMalloc((void *)(&dx), sizeof(float) num_points);
	cudaMalloc((void *)(&dy), sizeof(float) num_points);
	cudaMalloc((void *)(&dlabels), sizeof(int) num_points);
	cudaMalloc((void *)(&davgx), sizeof(float) num_cluster);
	cudaMalloc((void *)(&davgy), sizeof(float) num_cluster);
	cudaMalloc((void *)(&deflags), sizeof(int) num_points);

	//CudaMemoryCopyHostToDevice
	cudaMemcpy(dx, x, sizeof(float)*num_points, cudaMemcpyHostToDevice);
	cudaMemcpy(dy, y, sizeof(float)*num_points, cudaMemcpyHostToDevice);
	cudaMemcpy(dlabels, labels, sizeof(int)*num_points, cudaMemcpyHostToDevice);

	dim3 blockNum(num_points/BLOCKSIZE, 1, 1);
	dim3 threadNum(BLOCKSIZE, 1, 1);

	for (i = 0; i < num_points; i++) {
	eflags[i] = 0;
	}

	// [1]: average point of each cluster start
	do {
	eflags[0] = 1;
	for (i = 0; i < num_cluster; i++) {
	sumx[i] = 0.0;
	sumy[i] = 0.0;
	cnt[i] = 0;
	}
	for (i = 0; i < num_points; i++) {
	sumx[labels[i]] += x[i];
	sumy[labels[i]] += y[i];
	cnt[labels[i]]++;
	}
	for (i = 0; i < num_cluster; i++) {
	if (cnt[i] != 0) {
	avgx[i] = sumx[i] / cnt[i];
	avgy[i] = sumy[i] / cnt[i];
	}
	}
	//CudaMemoryCopyHostToDevice
	cudaMemcpy(davgx, avgx, sizeof(float)*num_cluster, cudaMemcpyHostToDevice);
	cudaMemcpy(davgy, avgy, sizeof(float)*num_cluster, cudaMemcpyHostToDevice);
	cudaMemcpy(deflags, eflags, sizeof(int)*num_points, cudaMemcpyHostToDevice);
	// [1]: average point of each cluster end

	// [2]: the nearest average point
	// Kernel Program
	nearest_point<<<blockNum, threadNum>>>(dx, dy, dlabels, davgx, davgy, deflags, num_cluster);
	//Kernel Error Output Start
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) { // cudaSuccess, cudaFail が使える
	printf("-------CudaError----------\n");
	printf("%s\n", cudaGetErrorString(err));// エラーIDからエラーメッセージ取得
	printf("--------------------------\n");
	}
	//Kernel Error Output End

	cudaMemcpy(labels, dlabels, sizeof(int)*num_points, cudaMemcpyDeviceToHost);
	cudaMemcpy(eflags, deflags, sizeof(int)*num_points, cudaMemcpyDeviceToHost);

	loop_count++;
	}while(eflags[0] == 0);
	// [3]: [1], [2] continue until no change in [2]
	free(sumx);
	free(sumy);
	free(cnt);
	free(avgx);
	free(avgy);
	free(eflags);
	cudaFree(dx);
	cudaFree(dy);
	cudaFree(dlabels);
	cudaFree(davgx);
	cudaFree(davgy);
	cudaFree(deflags);
	}