Skip to content

Instantly share code, notes, and snippets.

@momotaro98
Created December 24, 2015 08:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save momotaro98/7d4c6f8b71b00d494d26 to your computer and use it in GitHub Desktop.
Save momotaro98/7d4c6f8b71b00d494d26 to your computer and use it in GitHub Desktop.
授業"コンピュータ・アーキテクチャ"の最終課題 K-Means計算をGPUで処理 CUDAで実装
// -*- C++ -*-
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include "kmeans.h"
#define BLOCKSIZE (64) // 汎化のためにもdefineを定義するのはBLOCKSIZEだけにしよう
#ifdef __cplusplus
extern "C"
#endif
__global__ void nearest_point(float* inx, float* iny, int* inlabels, float* inavgx, float* inavgy, int* ineflags, int num_cluster)
{
int tmp;
int i = blockDim.x * blockIdx.x + threadIdx.x;
int k = threadIdx.x;
//check nearest start
int j;
float nearest_distance = DBL_MAX;
float tmp_f;
int ret = 0;
__shared__ float Xs[BLOCKSIZE];
__shared__ float Ys[BLOCKSIZE];
Xs[k] = inx[i];
Ys[k] = iny[i];
for(j = 0; j < num_cluster; j++){
tmp_f = (Xs[k] - inavgx[j]) * (Xs[k] - inavgx[j]) + (Ys[k] - inavgy[j]) * (Ys[k] - inavgy[j]);
if (tmp_f < nearest_distance) {
nearest_distance = tmp_f;
ret = j;
}
}
tmp = ret;
//check nearest end
if(tmp != inlabels[i]){ ineflags[0] = 0; }
inlabels[i] = tmp;
}
extern "C"
void gpu_kmeans(kmeans_t *problem)
{
float *x = problem->x;
float *y = problem->y;
int *labels = problem->labels;
int num_points = problem->num_points;
int num_cluster = problem->num_clusters;
int i;
int loop_count = 0;
float *avgx, *avgy;
float *sumx, *sumy;
int *cnt;
int *eflags;
//CPUMemoryAllocation
avgx = (float *)malloc(num_cluster * sizeof(float));
avgy = (float *)malloc(num_cluster * sizeof(float));
sumx = (float *)malloc(num_cluster * sizeof(float));
sumy = (float *)malloc(num_cluster * sizeof(float));
cnt = (int *)malloc(num_cluster * sizeof(int));
eflags = (int *)malloc(sizeof(int) * num_points);
//Cuda Variable
float* dx;
float* dy;
int* dlabels;
float* davgx;
float* davgy;
int* deflags;
//CudaMemoryAllocation
cudaMalloc((void **)(&dx), sizeof(float) * num_points);
cudaMalloc((void **)(&dy), sizeof(float) * num_points);
cudaMalloc((void **)(&dlabels), sizeof(int) * num_points);
cudaMalloc((void **)(&davgx), sizeof(float) * num_cluster);
cudaMalloc((void **)(&davgy), sizeof(float) * num_cluster);
cudaMalloc((void **)(&deflags), sizeof(int) * num_points);
//CudaMemoryCopyHostToDevice
cudaMemcpy(dx, x, sizeof(float)*num_points, cudaMemcpyHostToDevice);
cudaMemcpy(dy, y, sizeof(float)*num_points, cudaMemcpyHostToDevice);
cudaMemcpy(dlabels, labels, sizeof(int)*num_points, cudaMemcpyHostToDevice);
dim3 blockNum(num_points/BLOCKSIZE, 1, 1);
dim3 threadNum(BLOCKSIZE, 1, 1);
for (i = 0; i < num_points; i++) {
eflags[i] = 0;
}
// [1]: average point of each cluster start
do {
eflags[0] = 1;
for (i = 0; i < num_cluster; i++) {
sumx[i] = 0.0;
sumy[i] = 0.0;
cnt[i] = 0;
}
for (i = 0; i < num_points; i++) {
sumx[labels[i]] += x[i];
sumy[labels[i]] += y[i];
cnt[labels[i]]++;
}
for (i = 0; i < num_cluster; i++) {
if (cnt[i] != 0) {
avgx[i] = sumx[i] / cnt[i];
avgy[i] = sumy[i] / cnt[i];
}
}
//CudaMemoryCopyHostToDevice
cudaMemcpy(davgx, avgx, sizeof(float)*num_cluster, cudaMemcpyHostToDevice);
cudaMemcpy(davgy, avgy, sizeof(float)*num_cluster, cudaMemcpyHostToDevice);
cudaMemcpy(deflags, eflags, sizeof(int)*num_points, cudaMemcpyHostToDevice);
// [1]: average point of each cluster end
// [2]: the nearest average point
// Kernel Program
nearest_point<<<blockNum, threadNum>>>(dx, dy, dlabels, davgx, davgy, deflags, num_cluster);
//Kernel Error Output Start
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) { // cudaSuccess, cudaFail が使える
printf("-------CudaError----------\n");
printf("%s\n", cudaGetErrorString(err));// エラーIDからエラーメッセージ取得
printf("--------------------------\n");
}
//Kernel Error Output End
cudaMemcpy(labels, dlabels, sizeof(int)*num_points, cudaMemcpyDeviceToHost);
cudaMemcpy(eflags, deflags, sizeof(int)*num_points, cudaMemcpyDeviceToHost);
loop_count++;
}while(eflags[0] == 0);
// [3]: [1], [2] continue until no change in [2]
free(sumx);
free(sumy);
free(cnt);
free(avgx);
free(avgy);
free(eflags);
cudaFree(dx);
cudaFree(dy);
cudaFree(dlabels);
cudaFree(davgx);
cudaFree(davgy);
cudaFree(deflags);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment