Skip to content

Instantly share code, notes, and snippets.

@nazarov-yuriy
Created June 24, 2013 19:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nazarov-yuriy/5852592 to your computer and use it in GitHub Desktop.
Save nazarov-yuriy/5852592 to your computer and use it in GitHub Desktop.
#define BLOCK_NUMBER 32
#define BLOCK_SIZE 256
#define EPS 1e-8
//MAX_ITERS - максимальное число итераций, выполняемое за один запуск ядра
#define MAX_ITERS 200*BLOCK_NUMBER
#define MAX_DEGREE 8
#define BENCHMARK
#include <cuda_runtime.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include <curand_kernel.h>
unsigned long get_current_time_in_us() {
struct timeval tv;
gettimeofday(&tv, 0);
return (tv.tv_sec * 1000000l + tv.tv_usec);
}
#define CUDA_CALL( err )\
do{\
if (err != cudaSuccess) {\
printf("Error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);\
exit(EXIT_FAILURE);\
}\
}while(0)
const double PI = 3.14159265359;
typedef struct Result {
int inliners; // число инлайнеров
double* coef; // коэффициенты модели
int iters; // проведённое число итераций
} Result;
typedef struct Params {
int polyDegree; // масимальная степень искомого многочлена
int maxTrial; // максимальное число итераций
int maxMPts; // число точек, выбираемое для оценки
size_t size; // размер исходной выборки
double tolerance; // пороговое значение
double outliners;
} Params;
__global__ void solveRLS(double*, double*, Params*, double*, double*, double*,
int*, double*, double*, curandState*, int);
Result polyransac(double*, double*, Params);
int main(int, char**);
void forChart(Params);
void massTest(Params);
//
// выполнение заданного числа итераций алгоритма RANSAC на устройстве
// x, y - исходная выборка (x,y), по которым строится модель
// par - параметры решения (максимальная степень и т.п.)
//
// указатели на общий буфер (каждый блок использует свой кусок внутри буфера):
//
// Vs - для хранения матриц (одна на блок) системы
// bs - для правых частей системы
// coefs - для "лучших" коэффициентов, полученных в блоке
// in_liners - для количества инлайнеров в лучшей модели (в рамках блока)
// hs_vs - для разложения Хаусхолдера
// hs_ws - для разложения Хаусхолдера
// rand_states - переменные для генерации случайных чисел (в каждом треде своя)
//
// iters - "заказанное" число итераций, для выполнения на устройстве
//
__global__ void solveRLS(
double* x,
double* y,
Params par,
double* Vs,
double* bs,
double* coefs,
int* out_inliners,
double* hs_vs,
double* hs_ws,
int iters,
int seed) {
__shared__ int inliners[BLOCK_SIZE];
__shared__ int max_pts;
__shared__ int size;
__shared__ int degree;
__shared__ double tolerance;
__shared__ int* p_inliners;
__shared__ double* p_coefs;
__shared__ double* V;
__shared__ double* b;
__shared__ double* hs_v;
__shared__ double* hs_w;
__shared__ int trialCount;
__shared__ double pNoOutliners;
__shared__ int i;
__shared__ int N;
__shared__ double coeff[MAX_DEGREE];
int k, current_index, tid;
double current_x, current_y, temp;
int TID = threadIdx.x;
int UID = blockIdx.x;
int WORKERS = gridDim.x;
int T_WORKERS = blockDim.x;
if (!TID) {
max_pts = par.maxMPts;
size = par.size;
degree = par.polyDegree;
tolerance = par.tolerance;
p_inliners = out_inliners + UID;
*p_inliners = -1;
p_coefs = coefs + (degree + 1) * UID;
V = Vs + (degree + 1) * max_pts * UID;
b = bs + (max_pts) * UID;
hs_v = hs_vs + max_pts * UID;
hs_w = hs_ws + (degree + 1) * UID;
N = (iters + WORKERS - 1) / WORKERS;
trialCount = 0;
}
curandState local_state;
curand_init(
seed,
TID + UID * T_WORKERS,
0,
&local_state
);
//begin ransac
while (N > trialCount) {
__syncthreads();
tid = TID;
while (tid < max_pts) {
V[tid] = 1.0;
tid += T_WORKERS;
}
__syncthreads();
if (!TID) {
//обеспечиваем полный ранг
for (i = 0; i <= degree;) {
current_index = curand(&local_state) % size;
current_x = x[current_index];
k = 0;
while (k < i && V[max_pts + k] != current_x)
k++;
if (k != i)
continue;
V[i + max_pts] = current_x;
b[i] = y[current_index];
i++;
}
}
__syncthreads();
tid = TID + i;
while (tid < max_pts) {
current_index = curand(&local_state) % size;
current_x = x[current_index];
V[tid + max_pts] = current_x;
b[tid] = y[current_index];
tid += T_WORKERS;
}
__syncthreads();
tid = TID;
while (tid < max_pts) {
k = 2;
current_x = V[tid + max_pts];
temp = current_x;
while (k <= degree) {
temp *= current_x;
V[tid + k * max_pts] = temp;
k++;
}
tid += T_WORKERS;
}
if (!TID) {
i = 0;
}
__syncthreads();
// решаем переопределённую систему
// matrix by column-major a(i,j) = A(i+j*rows)
while (i < degree + 1) {
// вычисление очередного вектора Хаусхолдера
if (!TID) {
pNoOutliners = 0.0;
for (k = i; k < max_pts; k++)
pNoOutliners += V[i * max_pts + k] * V[i * max_pts + k];
if (pNoOutliners > 1e-10) {
pNoOutliners = sqrt(pNoOutliners);
if (abs(V[i + i * max_pts]) < 1e-10)
k = 0;
else
k = V[i + i * max_pts] > 0 ? 1 : -1;
pNoOutliners = V[i + i * max_pts] + k * pNoOutliners;
} else {
pNoOutliners = 1.0;
}
hs_v[i] = 1.0;
}
__syncthreads();
tid = TID + i + 1;
while (tid < max_pts) {
hs_v[tid] = V[i * max_pts + tid] / pNoOutliners;
tid += T_WORKERS;
}
__syncthreads();
// модификация матрицы
if (!TID) {
pNoOutliners = 0.0;
for (k = i; k < max_pts; k++)
pNoOutliners += hs_v[k] * hs_v[k];
pNoOutliners = -2 / pNoOutliners;
}
__syncthreads();
tid = TID + i;
while (tid < degree + 1) {
current_x = 0.0;
for (k = i; k < max_pts; k++)
current_x += V[k + tid * max_pts] * hs_v[k];
hs_w[tid] = current_x * pNoOutliners;
tid += T_WORKERS;
}
__syncthreads();
tid = TID + i;
while (tid < max_pts) {
current_x = hs_v[tid];
for (k = i; k < degree + 1; k++)
V[tid + k * max_pts] += current_x * hs_w[k];
tid += T_WORKERS;
}
__syncthreads();
// запись нетривиальной части вектора Хаусхолдера
tid = TID + i + 1;
while (tid < max_pts) {
V[tid + i * max_pts] = hs_v[tid];
tid += T_WORKERS;
}
__syncthreads();
// модификация правой части b->Q'b
tid = TID + i;
while (tid < max_pts) {
current_x = b[i];
for (k = i + 1; k < max_pts; k++)
current_x += b[k] * V[k + i * max_pts];
hs_v[tid] = current_x * pNoOutliners;
tid += T_WORKERS;
}
__syncthreads();
tid = TID + i + 1;
while (tid < max_pts) {
b[tid] += V[tid + i * max_pts] * hs_v[tid];
tid += T_WORKERS;
}
__syncthreads();
if (!TID) {
b[i] += hs_v[i];
i++;
}
__syncthreads();
}
__syncthreads();
if (!TID)
i = degree;
__syncthreads();
// решение верхнетреугольной системы Rx=Q'b
while (i > 0) {
if (!TID) {
b[i] /= V[i + i * max_pts];
pNoOutliners = b[i];
}
if (TID < i)
b[TID] -= V[TID + i * max_pts] * pNoOutliners;
if (!TID)
i--;
__syncthreads();
}
if (!TID)
b[0] /= V[0];
__syncthreads();
if (TID <= degree) {
coeff[TID] = b[TID];
if (fabs(coeff[TID]) < EPS)
coeff[TID] = 0.0;
}
__syncthreads();
inliners[TID] = 0;
tid = TID;
while (tid < size) {
current_x = x[tid];
current_y = y[tid];
temp = 0.0;
for (k = degree; k >= 0; k--)
//temp = temp*current_x+coeff[k];
temp = fma(temp, current_x, coeff[k]);
if (abs(temp - current_y) < tolerance)
inliners[TID]++;
tid += T_WORKERS;
}
__syncthreads();
if (!TID) {
for (k = 1; k < T_WORKERS; k++)
inliners[0] += inliners[k];
k = inliners[0];
if (k > *p_inliners) {
*p_inliners = k;
for (i = 0; i <= degree; i++)
p_coefs[i] = coeff[i];
}
trialCount++;
}
__syncthreads();
}
}
Result polyransac(double* h_x, double* h_y, Params par) {
const double pr = log(0.001);
Result h_res;
double* h_coefs = (double*) malloc( BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ) );
int* h_results = (int*) malloc( BLOCK_NUMBER*sizeof( int ) );
double* d_x;
double* d_y;
double* d_Vs;
double* d_bs;
double* d_coefs;
double* d_hs_vs;
double* d_hs_ws;
int* d_results;
CUDA_CALL(cudaMalloc((void ** )&d_x, par.size * sizeof(double)));
CUDA_CALL(cudaMalloc((void ** )&d_y, par.size * sizeof(double)));
CUDA_CALL(cudaMalloc( (void **)&d_Vs, BLOCK_NUMBER*(par.polyDegree+1)*par.maxMPts*sizeof( double ) ));
CUDA_CALL(cudaMalloc( (void **)&d_bs, BLOCK_NUMBER*par.maxMPts*sizeof( double ) ));
CUDA_CALL(cudaMalloc( (void **)&d_coefs, BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ) ));
CUDA_CALL(cudaMalloc( (void **)&d_results, BLOCK_NUMBER*sizeof( int ) ));
CUDA_CALL(cudaMalloc( (void **)&d_hs_vs, BLOCK_NUMBER*par.maxMPts*sizeof( double ) ));
CUDA_CALL(cudaMalloc( (void **)&d_hs_ws, BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ) ));
CUDA_CALL(cudaMemcpy(d_x, h_x, par.size * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(d_y, h_y, par.size * sizeof(double), cudaMemcpyHostToDevice));
//Launch algo
int counter = 0;
#ifdef BENCHMARK
int needed = par.maxTrial;
#else
int needed = BLOCK_NUMBER;
#endif
h_res.inliners = -1;
h_res.coef = (double*) malloc((par.polyDegree + 1) * sizeof(double));
double delta;
int iters;
while (counter < needed) {
iters = needed - counter;
if (iters > MAX_ITERS)
iters = MAX_ITERS;
solveRLS<<<BLOCK_NUMBER, BLOCK_SIZE, (par.polyDegree+1)*sizeof( double )>>>(
d_x,
d_y,
par,
d_Vs,
d_bs,
d_coefs,
d_results,
d_hs_vs,
d_hs_ws,
iters,
rand()
);
counter += ((iters + BLOCK_NUMBER - 1) / BLOCK_NUMBER) * BLOCK_NUMBER;
CUDA_CALL(cudaMemcpy(h_results, d_results, BLOCK_NUMBER*sizeof( int ), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(h_coefs, d_coefs, BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ), cudaMemcpyDeviceToHost));
for(int i = 0; i<BLOCK_NUMBER; i++){
if (h_results[i] > h_res.inliners) {
for(int j=0;j<par.polyDegree + 1;j++)
h_res.coef[j] = h_coefs[i*(par.polyDegree+1) + j];
h_res.inliners = h_results[i];
delta = 1.0 - pow(1.0 * h_res.inliners / par.size, par.maxMPts);
delta = min(delta, 1 - 1e-7);
delta = max(delta, 1e-7);
#ifdef BENCHMARK
continue;
#else
needed = pr/log(delta);
needed = min(needed, par.maxTrial);
#endif
}
}
}
h_res.iters = counter;
//Free resources
cudaFree(d_x);
cudaFree(d_y);
cudaFree(d_Vs);
cudaFree(d_bs);
cudaFree(d_hs_vs);
cudaFree(d_hs_ws);
cudaFree(d_coefs);
cudaFree(d_results);
return h_res;
}
int main(int argc, char** argv) {
Params par;
#ifdef BENCHMARK
par.size = 0; //will be filled in function
par.maxMPts = 10;
par.maxTrial = 100000;
par.polyDegree = 2;
par.tolerance = 0.1;
par.outliners = 0.3;
massTest(par);
#else
par.size = 10000;
par.maxMPts = 50;
par.maxTrial = 100000;
par.polyDegree = 5;
par.tolerance = 0.02;
par.outliners = 0.05;
forChart(par);
#endif
return 0;
}
void generate_sample_with_Gaussian_noise(double *x, double *y, size_t size){
double u_noise_r = 0.0;
double u_noise_fi = 0.0;
double n_noise = 0.0;
double mul = 0.01;
srand(123456);
for (int i = 0; i < size; i++) {
x[i] = 10.0 * i / size + 5.0; //from 0 to 10.0
y[i] = 20 * x[i] * x[i] + 5; // 2*x*x + 0.5
if (i % 2)
y[i] += n_noise;
else {
u_noise_r = 1.0 * rand() / RAND_MAX;
u_noise_fi = 1.0 * rand() / RAND_MAX;
n_noise = mul * cos(2 * PI * u_noise_fi) * sqrt(-2 * log(u_noise_r));
y[i] += n_noise;
n_noise = mul * sin(2 * PI * u_noise_fi) * sqrt(-2 * log(u_noise_r));
}
}
}
void add_outliners(double *y, size_t size, double outliners){
for (int i = 0; i < outliners * size; i++)
y[i] += 1000;
}
void print_info(Params par, Result *res, unsigned long int ev_time){
printf("\nSample length:\t%ld\n", par.size);
printf("Estimated set length:\t%d\n", par.maxMPts);
printf("Maximum iterations:\t%d\n", par.maxTrial);
printf("Maximum degree of the result polynom:\t%d\n", par.polyDegree);
printf("Tolerance threshold:\t%.8f\n", par.tolerance);
printf("Maximum part of outliners:\t%.8f\n", par.outliners);
printf(" ### Total time:\t%.6f\n\n", 1.0 * ev_time / CLOCKS_PER_SEC);
printf("Iterations was:\t\t %d\n", res->iters);
printf("\n\nInliners = %d\n", res->inliners);
printf("\n\nCoefficients:\n\n");
for (int i = par.polyDegree; i > -1; i--)
printf("x^%d:\t%.15f\n", i, res->coef[i]);
printf("\n");
}
void forChart(Params par) {
Result res;
res.coef = (double*) malloc((par.polyDegree + 1) * sizeof(double));
double* x = (double*) malloc(sizeof(double) * par.size);
double* y = (double*) malloc(sizeof(double) * par.size);
generate_sample_with_Gaussian_noise(x, y, par.size);
add_outliners(y, par.size, par.outliners);
unsigned long int start_time = get_current_time_in_us();
res = polyransac(x, y, par);
unsigned long int execution_time = get_current_time_in_us() - start_time;
print_info(par, &res, execution_time);
free(res.coef);
free(x);
free(y);
}
void massTest(Params par) {
int sizes[] = { 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000 };
for (int j = 0; j < 9; j++) {
Result res;
par.size = sizes[j];
res.coef = (double*) malloc((par.polyDegree + 1) * sizeof(double));
double* x = (double*) malloc(sizeof(double) * par.size);
double* y = (double*) malloc(sizeof(double) * par.size);
generate_sample_with_Gaussian_noise(x, y, par.size);
add_outliners(y, par.size, par.outliners);
unsigned long int start_time = get_current_time_in_us();
for (int pp = 0; pp < 10; pp++)
res = polyransac(x, y, par);
unsigned long int execution_time = get_current_time_in_us() - start_time;
print_info(par, &res, execution_time);
free(res.coef);
free(x);
free(y);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment