Created
June 24, 2013 19:07
-
-
Save nazarov-yuriy/5852592 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define BLOCK_NUMBER 32 | |
#define BLOCK_SIZE 256 | |
#define EPS 1e-8 | |
//MAX_ITERS - максимальное число итераций, выполняемое за один запуск ядра | |
#define MAX_ITERS 200*BLOCK_NUMBER | |
#define MAX_DEGREE 8 | |
#define BENCHMARK | |
#include <cuda_runtime.h> | |
#include <sys/time.h> | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <curand_kernel.h> | |
unsigned long get_current_time_in_us() { | |
struct timeval tv; | |
gettimeofday(&tv, 0); | |
return (tv.tv_sec * 1000000l + tv.tv_usec); | |
} | |
#define CUDA_CALL( err )\ | |
do{\ | |
if (err != cudaSuccess) {\ | |
printf("Error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);\ | |
exit(EXIT_FAILURE);\ | |
}\ | |
}while(0) | |
const double PI = 3.14159265359; | |
typedef struct Result { | |
int inliners; // число инлайнеров | |
double* coef; // коэффициенты модели | |
int iters; // проведённое число итераций | |
} Result; | |
typedef struct Params { | |
int polyDegree; // масимальная степень искомого многочлена | |
int maxTrial; // максимальное число итераций | |
int maxMPts; // число точек, выбираемое для оценки | |
size_t size; // размер исходной выборки | |
double tolerance; // пороговое значение | |
double outliners; | |
} Params; | |
__global__ void solveRLS(double*, double*, Params*, double*, double*, double*, | |
int*, double*, double*, curandState*, int); | |
Result polyransac(double*, double*, Params); | |
int main(int, char**); | |
void forChart(Params); | |
void massTest(Params); | |
// | |
// выполнение заданного числа итераций алгоритма RANSAC на устройстве | |
// x, y - исходная выборка (x,y), по которым строится модель | |
// par - параметры решения (максимальная степень и т.п.) | |
// | |
// указатели на общий буфер (каждый блок использует свой кусок внутри буфера): | |
// | |
// Vs - для хранения матриц (одна на блок) системы | |
// bs - для правых частей системы | |
// coefs - для "лучших" коэффициентов, полученных в блоке | |
// in_liners - для количества инлайнеров в лучшей модели (в рамках блока) | |
// hs_vs - для разложения Хаусхолдера | |
// hs_ws - для разложения Хаусхолдера | |
// rand_states - переменные для генерации случайных чисел (в каждом треде своя) | |
// | |
// iters - "заказанное" число итераций, для выполнения на устройстве | |
// | |
__global__ void solveRLS( | |
double* x, | |
double* y, | |
Params par, | |
double* Vs, | |
double* bs, | |
double* coefs, | |
int* out_inliners, | |
double* hs_vs, | |
double* hs_ws, | |
int iters, | |
int seed) { | |
__shared__ int inliners[BLOCK_SIZE]; | |
__shared__ int max_pts; | |
__shared__ int size; | |
__shared__ int degree; | |
__shared__ double tolerance; | |
__shared__ int* p_inliners; | |
__shared__ double* p_coefs; | |
__shared__ double* V; | |
__shared__ double* b; | |
__shared__ double* hs_v; | |
__shared__ double* hs_w; | |
__shared__ int trialCount; | |
__shared__ double pNoOutliners; | |
__shared__ int i; | |
__shared__ int N; | |
__shared__ double coeff[MAX_DEGREE]; | |
int k, current_index, tid; | |
double current_x, current_y, temp; | |
int TID = threadIdx.x; | |
int UID = blockIdx.x; | |
int WORKERS = gridDim.x; | |
int T_WORKERS = blockDim.x; | |
if (!TID) { | |
max_pts = par.maxMPts; | |
size = par.size; | |
degree = par.polyDegree; | |
tolerance = par.tolerance; | |
p_inliners = out_inliners + UID; | |
*p_inliners = -1; | |
p_coefs = coefs + (degree + 1) * UID; | |
V = Vs + (degree + 1) * max_pts * UID; | |
b = bs + (max_pts) * UID; | |
hs_v = hs_vs + max_pts * UID; | |
hs_w = hs_ws + (degree + 1) * UID; | |
N = (iters + WORKERS - 1) / WORKERS; | |
trialCount = 0; | |
} | |
curandState local_state; | |
curand_init( | |
seed, | |
TID + UID * T_WORKERS, | |
0, | |
&local_state | |
); | |
//begin ransac | |
while (N > trialCount) { | |
__syncthreads(); | |
tid = TID; | |
while (tid < max_pts) { | |
V[tid] = 1.0; | |
tid += T_WORKERS; | |
} | |
__syncthreads(); | |
if (!TID) { | |
//обеспечиваем полный ранг | |
for (i = 0; i <= degree;) { | |
current_index = curand(&local_state) % size; | |
current_x = x[current_index]; | |
k = 0; | |
while (k < i && V[max_pts + k] != current_x) | |
k++; | |
if (k != i) | |
continue; | |
V[i + max_pts] = current_x; | |
b[i] = y[current_index]; | |
i++; | |
} | |
} | |
__syncthreads(); | |
tid = TID + i; | |
while (tid < max_pts) { | |
current_index = curand(&local_state) % size; | |
current_x = x[current_index]; | |
V[tid + max_pts] = current_x; | |
b[tid] = y[current_index]; | |
tid += T_WORKERS; | |
} | |
__syncthreads(); | |
tid = TID; | |
while (tid < max_pts) { | |
k = 2; | |
current_x = V[tid + max_pts]; | |
temp = current_x; | |
while (k <= degree) { | |
temp *= current_x; | |
V[tid + k * max_pts] = temp; | |
k++; | |
} | |
tid += T_WORKERS; | |
} | |
if (!TID) { | |
i = 0; | |
} | |
__syncthreads(); | |
// решаем переопределённую систему | |
// matrix by column-major a(i,j) = A(i+j*rows) | |
while (i < degree + 1) { | |
// вычисление очередного вектора Хаусхолдера | |
if (!TID) { | |
pNoOutliners = 0.0; | |
for (k = i; k < max_pts; k++) | |
pNoOutliners += V[i * max_pts + k] * V[i * max_pts + k]; | |
if (pNoOutliners > 1e-10) { | |
pNoOutliners = sqrt(pNoOutliners); | |
if (abs(V[i + i * max_pts]) < 1e-10) | |
k = 0; | |
else | |
k = V[i + i * max_pts] > 0 ? 1 : -1; | |
pNoOutliners = V[i + i * max_pts] + k * pNoOutliners; | |
} else { | |
pNoOutliners = 1.0; | |
} | |
hs_v[i] = 1.0; | |
} | |
__syncthreads(); | |
tid = TID + i + 1; | |
while (tid < max_pts) { | |
hs_v[tid] = V[i * max_pts + tid] / pNoOutliners; | |
tid += T_WORKERS; | |
} | |
__syncthreads(); | |
// модификация матрицы | |
if (!TID) { | |
pNoOutliners = 0.0; | |
for (k = i; k < max_pts; k++) | |
pNoOutliners += hs_v[k] * hs_v[k]; | |
pNoOutliners = -2 / pNoOutliners; | |
} | |
__syncthreads(); | |
tid = TID + i; | |
while (tid < degree + 1) { | |
current_x = 0.0; | |
for (k = i; k < max_pts; k++) | |
current_x += V[k + tid * max_pts] * hs_v[k]; | |
hs_w[tid] = current_x * pNoOutliners; | |
tid += T_WORKERS; | |
} | |
__syncthreads(); | |
tid = TID + i; | |
while (tid < max_pts) { | |
current_x = hs_v[tid]; | |
for (k = i; k < degree + 1; k++) | |
V[tid + k * max_pts] += current_x * hs_w[k]; | |
tid += T_WORKERS; | |
} | |
__syncthreads(); | |
// запись нетривиальной части вектора Хаусхолдера | |
tid = TID + i + 1; | |
while (tid < max_pts) { | |
V[tid + i * max_pts] = hs_v[tid]; | |
tid += T_WORKERS; | |
} | |
__syncthreads(); | |
// модификация правой части b->Q'b | |
tid = TID + i; | |
while (tid < max_pts) { | |
current_x = b[i]; | |
for (k = i + 1; k < max_pts; k++) | |
current_x += b[k] * V[k + i * max_pts]; | |
hs_v[tid] = current_x * pNoOutliners; | |
tid += T_WORKERS; | |
} | |
__syncthreads(); | |
tid = TID + i + 1; | |
while (tid < max_pts) { | |
b[tid] += V[tid + i * max_pts] * hs_v[tid]; | |
tid += T_WORKERS; | |
} | |
__syncthreads(); | |
if (!TID) { | |
b[i] += hs_v[i]; | |
i++; | |
} | |
__syncthreads(); | |
} | |
__syncthreads(); | |
if (!TID) | |
i = degree; | |
__syncthreads(); | |
// решение верхнетреугольной системы Rx=Q'b | |
while (i > 0) { | |
if (!TID) { | |
b[i] /= V[i + i * max_pts]; | |
pNoOutliners = b[i]; | |
} | |
if (TID < i) | |
b[TID] -= V[TID + i * max_pts] * pNoOutliners; | |
if (!TID) | |
i--; | |
__syncthreads(); | |
} | |
if (!TID) | |
b[0] /= V[0]; | |
__syncthreads(); | |
if (TID <= degree) { | |
coeff[TID] = b[TID]; | |
if (fabs(coeff[TID]) < EPS) | |
coeff[TID] = 0.0; | |
} | |
__syncthreads(); | |
inliners[TID] = 0; | |
tid = TID; | |
while (tid < size) { | |
current_x = x[tid]; | |
current_y = y[tid]; | |
temp = 0.0; | |
for (k = degree; k >= 0; k--) | |
//temp = temp*current_x+coeff[k]; | |
temp = fma(temp, current_x, coeff[k]); | |
if (abs(temp - current_y) < tolerance) | |
inliners[TID]++; | |
tid += T_WORKERS; | |
} | |
__syncthreads(); | |
if (!TID) { | |
for (k = 1; k < T_WORKERS; k++) | |
inliners[0] += inliners[k]; | |
k = inliners[0]; | |
if (k > *p_inliners) { | |
*p_inliners = k; | |
for (i = 0; i <= degree; i++) | |
p_coefs[i] = coeff[i]; | |
} | |
trialCount++; | |
} | |
__syncthreads(); | |
} | |
} | |
Result polyransac(double* h_x, double* h_y, Params par) { | |
const double pr = log(0.001); | |
Result h_res; | |
double* h_coefs = (double*) malloc( BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ) ); | |
int* h_results = (int*) malloc( BLOCK_NUMBER*sizeof( int ) ); | |
double* d_x; | |
double* d_y; | |
double* d_Vs; | |
double* d_bs; | |
double* d_coefs; | |
double* d_hs_vs; | |
double* d_hs_ws; | |
int* d_results; | |
CUDA_CALL(cudaMalloc((void ** )&d_x, par.size * sizeof(double))); | |
CUDA_CALL(cudaMalloc((void ** )&d_y, par.size * sizeof(double))); | |
CUDA_CALL(cudaMalloc( (void **)&d_Vs, BLOCK_NUMBER*(par.polyDegree+1)*par.maxMPts*sizeof( double ) )); | |
CUDA_CALL(cudaMalloc( (void **)&d_bs, BLOCK_NUMBER*par.maxMPts*sizeof( double ) )); | |
CUDA_CALL(cudaMalloc( (void **)&d_coefs, BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ) )); | |
CUDA_CALL(cudaMalloc( (void **)&d_results, BLOCK_NUMBER*sizeof( int ) )); | |
CUDA_CALL(cudaMalloc( (void **)&d_hs_vs, BLOCK_NUMBER*par.maxMPts*sizeof( double ) )); | |
CUDA_CALL(cudaMalloc( (void **)&d_hs_ws, BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ) )); | |
CUDA_CALL(cudaMemcpy(d_x, h_x, par.size * sizeof(double), cudaMemcpyHostToDevice)); | |
CUDA_CALL(cudaMemcpy(d_y, h_y, par.size * sizeof(double), cudaMemcpyHostToDevice)); | |
//Launch algo | |
int counter = 0; | |
#ifdef BENCHMARK | |
int needed = par.maxTrial; | |
#else | |
int needed = BLOCK_NUMBER; | |
#endif | |
h_res.inliners = -1; | |
h_res.coef = (double*) malloc((par.polyDegree + 1) * sizeof(double)); | |
double delta; | |
int iters; | |
while (counter < needed) { | |
iters = needed - counter; | |
if (iters > MAX_ITERS) | |
iters = MAX_ITERS; | |
solveRLS<<<BLOCK_NUMBER, BLOCK_SIZE, (par.polyDegree+1)*sizeof( double )>>>( | |
d_x, | |
d_y, | |
par, | |
d_Vs, | |
d_bs, | |
d_coefs, | |
d_results, | |
d_hs_vs, | |
d_hs_ws, | |
iters, | |
rand() | |
); | |
counter += ((iters + BLOCK_NUMBER - 1) / BLOCK_NUMBER) * BLOCK_NUMBER; | |
CUDA_CALL(cudaMemcpy(h_results, d_results, BLOCK_NUMBER*sizeof( int ), cudaMemcpyDeviceToHost)); | |
CUDA_CALL(cudaMemcpy(h_coefs, d_coefs, BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ), cudaMemcpyDeviceToHost)); | |
for(int i = 0; i<BLOCK_NUMBER; i++){ | |
if (h_results[i] > h_res.inliners) { | |
for(int j=0;j<par.polyDegree + 1;j++) | |
h_res.coef[j] = h_coefs[i*(par.polyDegree+1) + j]; | |
h_res.inliners = h_results[i]; | |
delta = 1.0 - pow(1.0 * h_res.inliners / par.size, par.maxMPts); | |
delta = min(delta, 1 - 1e-7); | |
delta = max(delta, 1e-7); | |
#ifdef BENCHMARK | |
continue; | |
#else | |
needed = pr/log(delta); | |
needed = min(needed, par.maxTrial); | |
#endif | |
} | |
} | |
} | |
h_res.iters = counter; | |
//Free resources | |
cudaFree(d_x); | |
cudaFree(d_y); | |
cudaFree(d_Vs); | |
cudaFree(d_bs); | |
cudaFree(d_hs_vs); | |
cudaFree(d_hs_ws); | |
cudaFree(d_coefs); | |
cudaFree(d_results); | |
return h_res; | |
} | |
int main(int argc, char** argv) { | |
Params par; | |
#ifdef BENCHMARK | |
par.size = 0; //will be filled in function | |
par.maxMPts = 10; | |
par.maxTrial = 100000; | |
par.polyDegree = 2; | |
par.tolerance = 0.1; | |
par.outliners = 0.3; | |
massTest(par); | |
#else | |
par.size = 10000; | |
par.maxMPts = 50; | |
par.maxTrial = 100000; | |
par.polyDegree = 5; | |
par.tolerance = 0.02; | |
par.outliners = 0.05; | |
forChart(par); | |
#endif | |
return 0; | |
} | |
void generate_sample_with_Gaussian_noise(double *x, double *y, size_t size){ | |
double u_noise_r = 0.0; | |
double u_noise_fi = 0.0; | |
double n_noise = 0.0; | |
double mul = 0.01; | |
srand(123456); | |
for (int i = 0; i < size; i++) { | |
x[i] = 10.0 * i / size + 5.0; //from 0 to 10.0 | |
y[i] = 20 * x[i] * x[i] + 5; // 2*x*x + 0.5 | |
if (i % 2) | |
y[i] += n_noise; | |
else { | |
u_noise_r = 1.0 * rand() / RAND_MAX; | |
u_noise_fi = 1.0 * rand() / RAND_MAX; | |
n_noise = mul * cos(2 * PI * u_noise_fi) * sqrt(-2 * log(u_noise_r)); | |
y[i] += n_noise; | |
n_noise = mul * sin(2 * PI * u_noise_fi) * sqrt(-2 * log(u_noise_r)); | |
} | |
} | |
} | |
void add_outliners(double *y, size_t size, double outliners){ | |
for (int i = 0; i < outliners * size; i++) | |
y[i] += 1000; | |
} | |
void print_info(Params par, Result *res, unsigned long int ev_time){ | |
printf("\nSample length:\t%ld\n", par.size); | |
printf("Estimated set length:\t%d\n", par.maxMPts); | |
printf("Maximum iterations:\t%d\n", par.maxTrial); | |
printf("Maximum degree of the result polynom:\t%d\n", par.polyDegree); | |
printf("Tolerance threshold:\t%.8f\n", par.tolerance); | |
printf("Maximum part of outliners:\t%.8f\n", par.outliners); | |
printf(" ### Total time:\t%.6f\n\n", 1.0 * ev_time / CLOCKS_PER_SEC); | |
printf("Iterations was:\t\t %d\n", res->iters); | |
printf("\n\nInliners = %d\n", res->inliners); | |
printf("\n\nCoefficients:\n\n"); | |
for (int i = par.polyDegree; i > -1; i--) | |
printf("x^%d:\t%.15f\n", i, res->coef[i]); | |
printf("\n"); | |
} | |
void forChart(Params par) { | |
Result res; | |
res.coef = (double*) malloc((par.polyDegree + 1) * sizeof(double)); | |
double* x = (double*) malloc(sizeof(double) * par.size); | |
double* y = (double*) malloc(sizeof(double) * par.size); | |
generate_sample_with_Gaussian_noise(x, y, par.size); | |
add_outliners(y, par.size, par.outliners); | |
unsigned long int start_time = get_current_time_in_us(); | |
res = polyransac(x, y, par); | |
unsigned long int execution_time = get_current_time_in_us() - start_time; | |
print_info(par, &res, execution_time); | |
free(res.coef); | |
free(x); | |
free(y); | |
} | |
void massTest(Params par) { | |
int sizes[] = { 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000 }; | |
for (int j = 0; j < 9; j++) { | |
Result res; | |
par.size = sizes[j]; | |
res.coef = (double*) malloc((par.polyDegree + 1) * sizeof(double)); | |
double* x = (double*) malloc(sizeof(double) * par.size); | |
double* y = (double*) malloc(sizeof(double) * par.size); | |
generate_sample_with_Gaussian_noise(x, y, par.size); | |
add_outliners(y, par.size, par.outliners); | |
unsigned long int start_time = get_current_time_in_us(); | |
for (int pp = 0; pp < 10; pp++) | |
res = polyransac(x, y, par); | |
unsigned long int execution_time = get_current_time_in_us() - start_time; | |
print_info(par, &res, execution_time); | |
free(res.coef); | |
free(x); | |
free(y); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment