nazarov-yuriy/kernel.cu

## kernel.cu
#define BLOCK_NUMBER 32
#define BLOCK_SIZE 256
#define EPS 1e-8
//MAX_ITERS - максимальное число итераций, выполняемое за один запуск ядра
#define MAX_ITERS 200*BLOCK_NUMBER
#define MAX_DEGREE 8

#define BENCHMARK

#include <cuda_runtime.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include <curand_kernel.h>

unsigned long get_current_time_in_us() {
	struct timeval tv;
	gettimeofday(&tv, 0);
	return (tv.tv_sec * 1000000l + tv.tv_usec);
}

#define CUDA_CALL( err )\
	do{\
		if (err != cudaSuccess) {\
			printf("Error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);\
			exit(EXIT_FAILURE);\
		}\
	}while(0)

const double PI = 3.14159265359;

typedef struct Result {
	int inliners;	// число инлайнеров
	double* coef;	// коэффициенты модели
	int iters;		// проведённое число итераций
} Result;

typedef struct Params {
	int polyDegree;	// масимальная степень искомого многочлена
	int maxTrial;	// максимальное число итераций
	int maxMPts;	// число точек, выбираемое для оценки
	size_t size;	// размер исходной выборки
	double tolerance; // пороговое значение
	double outliners;
} Params;

__global__ void solveRLS(double*, double*, Params*, double*, double*, double*,
		int*, double*, double*, curandState*, int);
Result polyransac(double*, double*, Params);
int main(int, char**);
void forChart(Params);
void massTest(Params);

//
// выполнение заданного числа итераций алгоритма RANSAC на устройстве
// x, y - исходная выборка (x,y), по которым строится модель
// par	- параметры решения (максимальная степень и т.п.)
//
// указатели на общий буфер (каждый блок использует свой кусок внутри буфера):
//
// Vs	- для хранения матриц (одна на блок) системы
// bs	- для правых частей системы
// coefs	- для "лучших" коэффициентов, полученных в блоке
// in_liners - для количества инлайнеров в лучшей модели (в рамках блока)
// hs_vs	- для разложения Хаусхолдера
// hs_ws	- для разложения Хаусхолдера
// rand_states	- переменные для генерации случайных чисел (в каждом треде своя)
//
// iters	- "заказанное" число итераций, для выполнения на устройстве
//
__global__ void solveRLS(
		double* x,
		double* y,
		Params par,
		double* Vs,
		double* bs,
		double* coefs,
		int* out_inliners,
		double* hs_vs,
		double* hs_ws,
		int iters,
		int seed) {

	__shared__ int inliners[BLOCK_SIZE];

	__shared__ int max_pts;
	__shared__ int size;
	__shared__ int degree;
	__shared__ double tolerance;

	__shared__ int* p_inliners;
	__shared__ double* p_coefs;

	__shared__ double* V;
	__shared__ double* b;

	__shared__ double* hs_v;
	__shared__ double* hs_w;

	__shared__ int trialCount;
	__shared__ double pNoOutliners;
	__shared__ int i;

	__shared__ int N;
	__shared__ double coeff[MAX_DEGREE];

	int k, current_index, tid;
	double current_x, current_y, temp;
	int TID = threadIdx.x;
	int UID = blockIdx.x;
	int WORKERS = gridDim.x;
	int T_WORKERS = blockDim.x;

	if (!TID) {
		max_pts = par.maxMPts;
		size = par.size;
		degree = par.polyDegree;
		tolerance = par.tolerance;

		p_inliners = out_inliners + UID;
		*p_inliners = -1;
		p_coefs = coefs + (degree + 1) * UID;

		V = Vs + (degree + 1) * max_pts * UID;
		b = bs + (max_pts) * UID;
		hs_v = hs_vs + max_pts * UID;
		hs_w = hs_ws + (degree + 1) * UID;
		N = (iters + WORKERS - 1) / WORKERS;
		trialCount = 0;
	}

	curandState local_state;
	curand_init(
		seed,
		TID + UID * T_WORKERS,
		0,
		&local_state
	);

	//begin ransac

	while (N > trialCount) {

		__syncthreads();

		tid = TID;
		while (tid < max_pts) {
			V[tid] = 1.0;
			tid += T_WORKERS;
		}

		__syncthreads();

		if (!TID) {
			//обеспечиваем полный ранг
			for (i = 0; i <= degree;) {
				current_index = curand(&local_state) % size;
				current_x = x[current_index];
				k = 0;
				while (k < i && V[max_pts + k] != current_x)
					k++;
				if (k != i)
					continue;
				V[i + max_pts] = current_x;
				b[i] = y[current_index];
				i++;
			}
		}

		__syncthreads();

		tid = TID + i;
		while (tid < max_pts) {
			current_index = curand(&local_state) % size;
			current_x = x[current_index];
			V[tid + max_pts] = current_x;
			b[tid] = y[current_index];
			tid += T_WORKERS;
		}

		__syncthreads();

		tid = TID;
		while (tid < max_pts) {
			k = 2;
			current_x = V[tid + max_pts];
			temp = current_x;
			while (k <= degree) {
				temp *= current_x;
				V[tid + k * max_pts] = temp;
				k++;
			}
			tid += T_WORKERS;
		}

		if (!TID) {
			i = 0;
		}

		__syncthreads();
		// решаем переопределённую систему
		// matrix by column-major a(i,j) = A(i+j*rows)
		while (i < degree + 1) {
			// вычисление очередного вектора Хаусхолдера
			if (!TID) {
				pNoOutliners = 0.0;
				for (k = i; k < max_pts; k++)
					pNoOutliners += V[i * max_pts + k] * V[i * max_pts + k];

				if (pNoOutliners > 1e-10) {
					pNoOutliners = sqrt(pNoOutliners);
					if (abs(V[i + i * max_pts]) < 1e-10)
						k = 0;
					else
						k = V[i + i * max_pts] > 0 ? 1 : -1;
					pNoOutliners = V[i + i * max_pts] + k * pNoOutliners;
				} else {
					pNoOutliners = 1.0;
				}
				hs_v[i] = 1.0;
			}

			__syncthreads();

			tid = TID + i + 1;
			while (tid < max_pts) {
				hs_v[tid] = V[i * max_pts + tid] / pNoOutliners;
				tid += T_WORKERS;
			}

			__syncthreads();

			// модификация матрицы
			if (!TID) {
				pNoOutliners = 0.0;
				for (k = i; k < max_pts; k++)
					pNoOutliners += hs_v[k] * hs_v[k];
				pNoOutliners = -2 / pNoOutliners;
			}

			__syncthreads();

			tid = TID + i;
			while (tid < degree + 1) {
				current_x = 0.0;
				for (k = i; k < max_pts; k++)
					current_x += V[k + tid * max_pts] * hs_v[k];
				hs_w[tid] = current_x * pNoOutliners;
				tid += T_WORKERS;
			}

			__syncthreads();

			tid = TID + i;
			while (tid < max_pts) {
				current_x = hs_v[tid];
				for (k = i; k < degree + 1; k++)
					V[tid + k * max_pts] += current_x * hs_w[k];
				tid += T_WORKERS;
			}

			__syncthreads();

			// запись нетривиальной части вектора Хаусхолдера
			tid = TID + i + 1;
			while (tid < max_pts) {
				V[tid + i * max_pts] = hs_v[tid];
				tid += T_WORKERS;
			}

			__syncthreads();

			// модификация правой части b->Q'b
			tid = TID + i;
			while (tid < max_pts) {
				current_x = b[i];
				for (k = i + 1; k < max_pts; k++)
					current_x += b[k] * V[k + i * max_pts];
				hs_v[tid] = current_x * pNoOutliners;
				tid += T_WORKERS;
			}

			__syncthreads();

			tid = TID + i + 1;
			while (tid < max_pts) {
				b[tid] += V[tid + i * max_pts] * hs_v[tid];
				tid += T_WORKERS;
			}

			__syncthreads();

			if (!TID) {
				b[i] += hs_v[i];
				i++;
			}
			__syncthreads();
		}

		__syncthreads();

		if (!TID)
			i = degree;

		__syncthreads();
		// решение верхнетреугольной системы Rx=Q'b
		while (i > 0) {
			if (!TID) {
				b[i] /= V[i + i * max_pts];
				pNoOutliners = b[i];
			}
			if (TID < i)
				b[TID] -= V[TID + i * max_pts] * pNoOutliners;

			if (!TID)
				i--;
			__syncthreads();
		}

		if (!TID)
			b[0] /= V[0];

		__syncthreads();

		if (TID <= degree) {
			coeff[TID] = b[TID];
			if (fabs(coeff[TID]) < EPS)
				coeff[TID] = 0.0;
		}

		__syncthreads();

		inliners[TID] = 0;
		tid = TID;
		while (tid < size) {
			current_x = x[tid];
			current_y = y[tid];
			temp = 0.0;
			for (k = degree; k >= 0; k--)
				//temp = temp*current_x+coeff[k];
				temp = fma(temp, current_x, coeff[k]);

			if (abs(temp - current_y) < tolerance)
				inliners[TID]++;
			tid += T_WORKERS;
		}

		__syncthreads();

		if (!TID) {
			for (k = 1; k < T_WORKERS; k++)
				inliners[0] += inliners[k];

			k = inliners[0];
			if (k > *p_inliners) {
				*p_inliners = k;
				for (i = 0; i <= degree; i++)
					p_coefs[i] = coeff[i];
			}
			trialCount++;
		}
		__syncthreads();
	}

}

Result polyransac(double* h_x, double* h_y, Params par) {
	const double pr = log(0.001);

	Result h_res;
	double* h_coefs = (double*) malloc( BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ) );
	int* h_results  = (int*)    malloc( BLOCK_NUMBER*sizeof( int ) );

	double* d_x;
	double* d_y;
	double* d_Vs;
	double* d_bs;
	double* d_coefs;
	double* d_hs_vs;
	double* d_hs_ws;
	int* d_results;

	CUDA_CALL(cudaMalloc((void ** )&d_x, par.size * sizeof(double)));
	CUDA_CALL(cudaMalloc((void ** )&d_y, par.size * sizeof(double)));
	CUDA_CALL(cudaMalloc( (void **)&d_Vs, BLOCK_NUMBER*(par.polyDegree+1)*par.maxMPts*sizeof( double ) ));
	CUDA_CALL(cudaMalloc( (void **)&d_bs, BLOCK_NUMBER*par.maxMPts*sizeof( double ) ));
	CUDA_CALL(cudaMalloc( (void **)&d_coefs, BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ) ));
	CUDA_CALL(cudaMalloc( (void **)&d_results, BLOCK_NUMBER*sizeof( int ) ));
	CUDA_CALL(cudaMalloc( (void **)&d_hs_vs, BLOCK_NUMBER*par.maxMPts*sizeof( double ) ));
	CUDA_CALL(cudaMalloc( (void **)&d_hs_ws, BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ) ));

	CUDA_CALL(cudaMemcpy(d_x, h_x, par.size * sizeof(double), cudaMemcpyHostToDevice));
	CUDA_CALL(cudaMemcpy(d_y, h_y, par.size * sizeof(double), cudaMemcpyHostToDevice));

	//Launch algo
	int counter = 0;
#ifdef BENCHMARK
	int needed = par.maxTrial;
#else
	int needed = BLOCK_NUMBER;
#endif
	h_res.inliners = -1;
	h_res.coef = (double*) malloc((par.polyDegree + 1) * sizeof(double));
	double delta;
	int iters;
	while (counter < needed) {
		iters = needed - counter;
		if (iters > MAX_ITERS)
			iters = MAX_ITERS;
		solveRLS<<<BLOCK_NUMBER, BLOCK_SIZE, (par.polyDegree+1)*sizeof( double )>>>(
			d_x,
			d_y,
			par,
			d_Vs,
			d_bs,
			d_coefs,
			d_results,
			d_hs_vs,
			d_hs_ws,
			iters,
			rand()
		);
		counter += ((iters + BLOCK_NUMBER - 1) / BLOCK_NUMBER) * BLOCK_NUMBER;

		CUDA_CALL(cudaMemcpy(h_results, d_results, BLOCK_NUMBER*sizeof( int ),                       cudaMemcpyDeviceToHost));
		CUDA_CALL(cudaMemcpy(h_coefs,   d_coefs,   BLOCK_NUMBER*(par.polyDegree+1)*sizeof( double ), cudaMemcpyDeviceToHost));
		for(int i = 0; i<BLOCK_NUMBER; i++){
			if (h_results[i] > h_res.inliners) {
				for(int j=0;j<par.polyDegree + 1;j++)
					h_res.coef[j] = h_coefs[i*(par.polyDegree+1) + j];
				h_res.inliners = h_results[i];
				delta = 1.0 - pow(1.0 * h_res.inliners / par.size, par.maxMPts);
				delta = min(delta, 1 - 1e-7);
				delta = max(delta, 1e-7);
#ifdef BENCHMARK
				continue;
#else
				needed = pr/log(delta);
				needed = min(needed, par.maxTrial);
#endif
			}
		}
	}

	h_res.iters = counter;

	//Free resources
	cudaFree(d_x);
	cudaFree(d_y);
	cudaFree(d_Vs);
	cudaFree(d_bs);
	cudaFree(d_hs_vs);
	cudaFree(d_hs_ws);
	cudaFree(d_coefs);
	cudaFree(d_results);

	return h_res;
}

int main(int argc, char** argv) {
	Params par;
#ifdef BENCHMARK
	par.size = 0;	//will be filled in function
	par.maxMPts = 10;
	par.maxTrial = 100000;
	par.polyDegree = 2;
	par.tolerance = 0.1;
	par.outliners = 0.3;
	massTest(par);

#else
	par.size = 10000;
	par.maxMPts = 50;
	par.maxTrial = 100000;
	par.polyDegree = 5;
	par.tolerance = 0.02;
	par.outliners = 0.05;
	forChart(par);
#endif
	return 0;
}

void generate_sample_with_Gaussian_noise(double *x, double *y, size_t size){
	double u_noise_r = 0.0;
	double u_noise_fi = 0.0;
	double n_noise = 0.0;
	double mul = 0.01;

	srand(123456);
	for (int i = 0; i < size; i++) {
		x[i] = 10.0 * i / size + 5.0;	//from 0 to 10.0
		y[i] = 20 * x[i] * x[i] + 5;	// 2*x*x + 0.5

		if (i % 2)
			y[i] += n_noise;
		else {
			u_noise_r = 1.0 * rand() / RAND_MAX;
			u_noise_fi = 1.0 * rand() / RAND_MAX;
			n_noise = mul * cos(2 * PI * u_noise_fi) * sqrt(-2 * log(u_noise_r));
			y[i] += n_noise;
			n_noise = mul * sin(2 * PI * u_noise_fi) * sqrt(-2 * log(u_noise_r));
		}
	}
}

void add_outliners(double *y, size_t size, double outliners){
	for (int i = 0; i < outliners * size; i++)
		y[i] += 1000;
}

void print_info(Params par, Result *res, unsigned long int ev_time){
	printf("\nSample length:\t%ld\n", par.size);
	printf("Estimated set length:\t%d\n", par.maxMPts);
	printf("Maximum iterations:\t%d\n", par.maxTrial);
	printf("Maximum degree of the result polynom:\t%d\n", par.polyDegree);
	printf("Tolerance threshold:\t%.8f\n", par.tolerance);
	printf("Maximum part of outliners:\t%.8f\n", par.outliners);
	printf(" ### Total time:\t%.6f\n\n", 1.0 * ev_time / CLOCKS_PER_SEC);
	printf("Iterations was:\t\t %d\n", res->iters);
	printf("\n\nInliners = %d\n", res->inliners);
	printf("\n\nCoefficients:\n\n");
	for (int i = par.polyDegree; i > -1; i--)
		printf("x^%d:\t%.15f\n", i, res->coef[i]);
	printf("\n");
}

void forChart(Params par) {
	Result res;
	res.coef = (double*) malloc((par.polyDegree + 1) * sizeof(double));

	double* x = (double*) malloc(sizeof(double) * par.size);
	double* y = (double*) malloc(sizeof(double) * par.size);

	generate_sample_with_Gaussian_noise(x, y, par.size);
	add_outliners(y, par.size, par.outliners);

	unsigned long int start_time = get_current_time_in_us();
	res = polyransac(x, y, par);
	unsigned long int execution_time = get_current_time_in_us() - start_time;

	print_info(par, &res, execution_time);

	free(res.coef);
	free(x);
	free(y);
}

void massTest(Params par) {
	int sizes[] = { 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000 };
	for (int j = 0; j < 9; j++) {
		Result res;
		par.size = sizes[j];
		res.coef = (double*) malloc((par.polyDegree + 1) * sizeof(double));

		double* x = (double*) malloc(sizeof(double) * par.size);
		double* y = (double*) malloc(sizeof(double) * par.size);

		generate_sample_with_Gaussian_noise(x, y, par.size);
		add_outliners(y, par.size, par.outliners);

		unsigned long int start_time = get_current_time_in_us();
		for (int pp = 0; pp < 10; pp++)
			res = polyransac(x, y, par);
		unsigned long int execution_time = get_current_time_in_us() - start_time;

		print_info(par, &res, execution_time);

		free(res.coef);
		free(x);
		free(y);
	}
}
	#define BLOCK_NUMBER 32
	#define BLOCK_SIZE 256
	#define EPS 1e-8
	//MAX_ITERS - максимальное число итераций, выполняемое за один запуск ядра
	#define MAX_ITERS 200*BLOCK_NUMBER
	#define MAX_DEGREE 8

	#define BENCHMARK

	#include <cuda_runtime.h>
	#include <sys/time.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <curand_kernel.h>

	unsigned long get_current_time_in_us() {
	struct timeval tv;
	gettimeofday(&tv, 0);
	return (tv.tv_sec * 1000000l + tv.tv_usec);
	}

	#define CUDA_CALL( err )\
	do{\
	if (err != cudaSuccess) {\
	printf("Error: %s at %s:%d\n", cudaGetErrorString(err), __FILE__, __LINE__);\
	exit(EXIT_FAILURE);\
	}\
	}while(0)

	const double PI = 3.14159265359;

	typedef struct Result {
	int inliners; // число инлайнеров
	double* coef; // коэффициенты модели
	int iters; // проведённое число итераций
	} Result;

	typedef struct Params {
	int polyDegree; // масимальная степень искомого многочлена
	int maxTrial; // максимальное число итераций
	int maxMPts; // число точек, выбираемое для оценки
	size_t size; // размер исходной выборки
	double tolerance; // пороговое значение
	double outliners;
	} Params;

	__global__ void solveRLS(double, double, Params, double, double, double,
	int, double, double, curandState, int);
	Result polyransac(double, double, Params);
	int main(int, char**);
	void forChart(Params);
	void massTest(Params);

	//
	// выполнение заданного числа итераций алгоритма RANSAC на устройстве
	// x, y - исходная выборка (x,y), по которым строится модель
	// par - параметры решения (максимальная степень и т.п.)
	//
	// указатели на общий буфер (каждый блок использует свой кусок внутри буфера):
	//
	// Vs - для хранения матриц (одна на блок) системы
	// bs - для правых частей системы
	// coefs - для "лучших" коэффициентов, полученных в блоке
	// in_liners - для количества инлайнеров в лучшей модели (в рамках блока)
	// hs_vs - для разложения Хаусхолдера
	// hs_ws - для разложения Хаусхолдера
	// rand_states - переменные для генерации случайных чисел (в каждом треде своя)
	//
	// iters - "заказанное" число итераций, для выполнения на устройстве
	//
	__global__ void solveRLS(
	double* x,
	double* y,
	Params par,
	double* Vs,
	double* bs,
	double* coefs,
	int* out_inliners,
	double* hs_vs,
	double* hs_ws,
	int iters,
	int seed) {

	__shared__ int inliners[BLOCK_SIZE];

	__shared__ int max_pts;
	__shared__ int size;
	__shared__ int degree;
	__shared__ double tolerance;

	__shared__ int* p_inliners;
	__shared__ double* p_coefs;

	__shared__ double* V;
	__shared__ double* b;

	__shared__ double* hs_v;
	__shared__ double* hs_w;

	__shared__ int trialCount;
	__shared__ double pNoOutliners;
	__shared__ int i;

	__shared__ int N;
	__shared__ double coeff[MAX_DEGREE];

	int k, current_index, tid;
	double current_x, current_y, temp;
	int TID = threadIdx.x;
	int UID = blockIdx.x;
	int WORKERS = gridDim.x;
	int T_WORKERS = blockDim.x;

	if (!TID) {
	max_pts = par.maxMPts;
	size = par.size;
	degree = par.polyDegree;
	tolerance = par.tolerance;

	p_inliners = out_inliners + UID;
	*p_inliners = -1;
	p_coefs = coefs + (degree + 1) * UID;

	V = Vs + (degree + 1) * max_pts * UID;
	b = bs + (max_pts) * UID;
	hs_v = hs_vs + max_pts * UID;
	hs_w = hs_ws + (degree + 1) * UID;
	N = (iters + WORKERS - 1) / WORKERS;
	trialCount = 0;
	}

	curandState local_state;
	curand_init(
	seed,
	TID + UID * T_WORKERS,
	0,
	&local_state
	);

	//begin ransac

	while (N > trialCount) {

	__syncthreads();

	tid = TID;
	while (tid < max_pts) {
	V[tid] = 1.0;
	tid += T_WORKERS;
	}

	__syncthreads();

	if (!TID) {
	//обеспечиваем полный ранг
	for (i = 0; i <= degree;) {
	current_index = curand(&local_state) % size;
	current_x = x[current_index];
	k = 0;
	while (k < i && V[max_pts + k] != current_x)
	k++;
	if (k != i)
	continue;
	V[i + max_pts] = current_x;
	b[i] = y[current_index];
	i++;
	}
	}

	__syncthreads();

	tid = TID + i;
	while (tid < max_pts) {
	current_index = curand(&local_state) % size;
	current_x = x[current_index];
	V[tid + max_pts] = current_x;
	b[tid] = y[current_index];
	tid += T_WORKERS;
	}

	__syncthreads();

	tid = TID;
	while (tid < max_pts) {
	k = 2;
	current_x = V[tid + max_pts];
	temp = current_x;
	while (k <= degree) {
	temp *= current_x;
	V[tid + k * max_pts] = temp;
	k++;
	}
	tid += T_WORKERS;
	}

	if (!TID) {
	i = 0;
	}

	__syncthreads();
	// решаем переопределённую систему
	// matrix by column-major a(i,j) = A(i+j*rows)
	while (i < degree + 1) {
	// вычисление очередного вектора Хаусхолдера
	if (!TID) {
	pNoOutliners = 0.0;
	for (k = i; k < max_pts; k++)
	pNoOutliners += V[i * max_pts + k] * V[i * max_pts + k];

	if (pNoOutliners > 1e-10) {
	pNoOutliners = sqrt(pNoOutliners);
	if (abs(V[i + i * max_pts]) < 1e-10)
	k = 0;
	else
	k = V[i + i * max_pts] > 0 ? 1 : -1;
	pNoOutliners = V[i + i * max_pts] + k * pNoOutliners;
	} else {
	pNoOutliners = 1.0;
	}
	hs_v[i] = 1.0;
	}

	__syncthreads();

	tid = TID + i + 1;
	while (tid < max_pts) {
	hs_v[tid] = V[i * max_pts + tid] / pNoOutliners;
	tid += T_WORKERS;
	}

	__syncthreads();

	// модификация матрицы
	if (!TID) {
	pNoOutliners = 0.0;
	for (k = i; k < max_pts; k++)
	pNoOutliners += hs_v[k] * hs_v[k];
	pNoOutliners = -2 / pNoOutliners;
	}

	__syncthreads();

	tid = TID + i;
	while (tid < degree + 1) {
	current_x = 0.0;
	for (k = i; k < max_pts; k++)
	current_x += V[k + tid * max_pts] * hs_v[k];
	hs_w[tid] = current_x * pNoOutliners;
	tid += T_WORKERS;
	}

	__syncthreads();

	tid = TID + i;
	while (tid < max_pts) {
	current_x = hs_v[tid];
	for (k = i; k < degree + 1; k++)
	V[tid + k * max_pts] += current_x * hs_w[k];
	tid += T_WORKERS;
	}

	__syncthreads();

	// запись нетривиальной части вектора Хаусхолдера
	tid = TID + i + 1;
	while (tid < max_pts) {
	V[tid + i * max_pts] = hs_v[tid];
	tid += T_WORKERS;
	}

	__syncthreads();

	// модификация правой части b->Q'b
	tid = TID + i;
	while (tid < max_pts) {
	current_x = b[i];
	for (k = i + 1; k < max_pts; k++)
	current_x += b[k] * V[k + i * max_pts];
	hs_v[tid] = current_x * pNoOutliners;
	tid += T_WORKERS;
	}

	__syncthreads();

	tid = TID + i + 1;
	while (tid < max_pts) {
	b[tid] += V[tid + i * max_pts] * hs_v[tid];
	tid += T_WORKERS;
	}

	__syncthreads();

	if (!TID) {
	b[i] += hs_v[i];
	i++;
	}
	__syncthreads();
	}

	__syncthreads();

	if (!TID)
	i = degree;

	__syncthreads();
	// решение верхнетреугольной системы Rx=Q'b
	while (i > 0) {
	if (!TID) {
	b[i] /= V[i + i * max_pts];
	pNoOutliners = b[i];
	}
	if (TID < i)
	b[TID] -= V[TID + i * max_pts] * pNoOutliners;

	if (!TID)
	i--;
	__syncthreads();
	}

	if (!TID)
	b[0] /= V[0];

	__syncthreads();

	if (TID <= degree) {
	coeff[TID] = b[TID];
	if (fabs(coeff[TID]) < EPS)
	coeff[TID] = 0.0;
	}

	__syncthreads();

	inliners[TID] = 0;
	tid = TID;
	while (tid < size) {
	current_x = x[tid];
	current_y = y[tid];
	temp = 0.0;
	for (k = degree; k >= 0; k--)
	//temp = temp*current_x+coeff[k];
	temp = fma(temp, current_x, coeff[k]);

	if (abs(temp - current_y) < tolerance)
	inliners[TID]++;
	tid += T_WORKERS;
	}

	__syncthreads();

	if (!TID) {
	for (k = 1; k < T_WORKERS; k++)
	inliners[0] += inliners[k];

	k = inliners[0];
	if (k > *p_inliners) {
	*p_inliners = k;
	for (i = 0; i <= degree; i++)
	p_coefs[i] = coeff[i];
	}
	trialCount++;
	}
	__syncthreads();
	}

	}

	Result polyransac(double* h_x, double* h_y, Params par) {
	const double pr = log(0.001);

	Result h_res;
	double* h_coefs = (double) malloc( BLOCK_NUMBER(par.polyDegree+1)*sizeof( double ) );
	int* h_results = (int) malloc( BLOCK_NUMBERsizeof( int ) );

	double* d_x;
	double* d_y;
	double* d_Vs;
	double* d_bs;
	double* d_coefs;
	double* d_hs_vs;
	double* d_hs_ws;
	int* d_results;

	CUDA_CALL(cudaMalloc((void ** )&d_x, par.size * sizeof(double)));
	CUDA_CALL(cudaMalloc((void ** )&d_y, par.size * sizeof(double)));
	CUDA_CALL(cudaMalloc( (void *)&d_Vs, BLOCK_NUMBER(par.polyDegree+1)par.maxMPtssizeof( double ) ));
	CUDA_CALL(cudaMalloc( (void *)&d_bs, BLOCK_NUMBERpar.maxMPts*sizeof( double ) ));
	CUDA_CALL(cudaMalloc( (void *)&d_coefs, BLOCK_NUMBER(par.polyDegree+1)*sizeof( double ) ));
	CUDA_CALL(cudaMalloc( (void *)&d_results, BLOCK_NUMBERsizeof( int ) ));
	CUDA_CALL(cudaMalloc( (void *)&d_hs_vs, BLOCK_NUMBERpar.maxMPts*sizeof( double ) ));
	CUDA_CALL(cudaMalloc( (void *)&d_hs_ws, BLOCK_NUMBER(par.polyDegree+1)*sizeof( double ) ));

	CUDA_CALL(cudaMemcpy(d_x, h_x, par.size * sizeof(double), cudaMemcpyHostToDevice));
	CUDA_CALL(cudaMemcpy(d_y, h_y, par.size * sizeof(double), cudaMemcpyHostToDevice));

	//Launch algo
	int counter = 0;
	#ifdef BENCHMARK
	int needed = par.maxTrial;
	#else
	int needed = BLOCK_NUMBER;
	#endif
	h_res.inliners = -1;
	h_res.coef = (double) malloc((par.polyDegree + 1) sizeof(double));
	double delta;
	int iters;
	while (counter < needed) {
	iters = needed - counter;
	if (iters > MAX_ITERS)
	iters = MAX_ITERS;
	solveRLS<<<BLOCK_NUMBER, BLOCK_SIZE, (par.polyDegree+1)*sizeof( double )>>>(
	d_x,
	d_y,
	par,
	d_Vs,
	d_bs,
	d_coefs,
	d_results,
	d_hs_vs,
	d_hs_ws,
	iters,
	rand()
	);
	counter += ((iters + BLOCK_NUMBER - 1) / BLOCK_NUMBER) * BLOCK_NUMBER;

	CUDA_CALL(cudaMemcpy(h_results, d_results, BLOCK_NUMBER*sizeof( int ), cudaMemcpyDeviceToHost));
	CUDA_CALL(cudaMemcpy(h_coefs, d_coefs, BLOCK_NUMBER(par.polyDegree+1)sizeof( double ), cudaMemcpyDeviceToHost));
	for(int i = 0; i<BLOCK_NUMBER; i++){
	if (h_results[i] > h_res.inliners) {
	for(int j=0;j<par.polyDegree + 1;j++)
	h_res.coef[j] = h_coefs[i*(par.polyDegree+1) + j];
	h_res.inliners = h_results[i];
	delta = 1.0 - pow(1.0 * h_res.inliners / par.size, par.maxMPts);
	delta = min(delta, 1 - 1e-7);
	delta = max(delta, 1e-7);
	#ifdef BENCHMARK
	continue;
	#else
	needed = pr/log(delta);
	needed = min(needed, par.maxTrial);
	#endif
	}
	}
	}

	h_res.iters = counter;

	//Free resources
	cudaFree(d_x);
	cudaFree(d_y);
	cudaFree(d_Vs);
	cudaFree(d_bs);
	cudaFree(d_hs_vs);
	cudaFree(d_hs_ws);
	cudaFree(d_coefs);
	cudaFree(d_results);

	return h_res;
	}

	int main(int argc, char** argv) {
	Params par;
	#ifdef BENCHMARK
	par.size = 0; //will be filled in function
	par.maxMPts = 10;
	par.maxTrial = 100000;
	par.polyDegree = 2;
	par.tolerance = 0.1;
	par.outliners = 0.3;
	massTest(par);

	#else
	par.size = 10000;
	par.maxMPts = 50;
	par.maxTrial = 100000;
	par.polyDegree = 5;
	par.tolerance = 0.02;
	par.outliners = 0.05;
	forChart(par);
	#endif
	return 0;
	}

	void generate_sample_with_Gaussian_noise(double x, double y, size_t size){
	double u_noise_r = 0.0;
	double u_noise_fi = 0.0;
	double n_noise = 0.0;
	double mul = 0.01;

	srand(123456);
	for (int i = 0; i < size; i++) {
	x[i] = 10.0 * i / size + 5.0; //from 0 to 10.0
	y[i] = 20 * x[i] * x[i] + 5; // 2xx + 0.5

	if (i % 2)
	y[i] += n_noise;
	else {
	u_noise_r = 1.0 * rand() / RAND_MAX;
	u_noise_fi = 1.0 * rand() / RAND_MAX;
	n_noise = mul * cos(2 * PI * u_noise_fi) * sqrt(-2 * log(u_noise_r));
	y[i] += n_noise;
	n_noise = mul * sin(2 * PI * u_noise_fi) * sqrt(-2 * log(u_noise_r));
	}
	}
	}

	void add_outliners(double *y, size_t size, double outliners){
	for (int i = 0; i < outliners * size; i++)
	y[i] += 1000;
	}

	void print_info(Params par, Result *res, unsigned long int ev_time){
	printf("\nSample length:\t%ld\n", par.size);
	printf("Estimated set length:\t%d\n", par.maxMPts);
	printf("Maximum iterations:\t%d\n", par.maxTrial);
	printf("Maximum degree of the result polynom:\t%d\n", par.polyDegree);
	printf("Tolerance threshold:\t%.8f\n", par.tolerance);
	printf("Maximum part of outliners:\t%.8f\n", par.outliners);
	printf(" ### Total time:\t%.6f\n\n", 1.0 * ev_time / CLOCKS_PER_SEC);
	printf("Iterations was:\t\t %d\n", res->iters);
	printf("\n\nInliners = %d\n", res->inliners);
	printf("\n\nCoefficients:\n\n");
	for (int i = par.polyDegree; i > -1; i--)
	printf("x^%d:\t%.15f\n", i, res->coef[i]);
	printf("\n");
	}

	void forChart(Params par) {
	Result res;
	res.coef = (double) malloc((par.polyDegree + 1) sizeof(double));

	double* x = (double) malloc(sizeof(double) par.size);
	double* y = (double) malloc(sizeof(double) par.size);

	generate_sample_with_Gaussian_noise(x, y, par.size);
	add_outliners(y, par.size, par.outliners);

	unsigned long int start_time = get_current_time_in_us();
	res = polyransac(x, y, par);
	unsigned long int execution_time = get_current_time_in_us() - start_time;

	print_info(par, &res, execution_time);

	free(res.coef);
	free(x);
	free(y);
	}

	void massTest(Params par) {
	int sizes[] = { 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000 };
	for (int j = 0; j < 9; j++) {
	Result res;
	par.size = sizes[j];
	res.coef = (double) malloc((par.polyDegree + 1) sizeof(double));

	double* x = (double) malloc(sizeof(double) par.size);
	double* y = (double) malloc(sizeof(double) par.size);

	generate_sample_with_Gaussian_noise(x, y, par.size);
	add_outliners(y, par.size, par.outliners);

	unsigned long int start_time = get_current_time_in_us();
	for (int pp = 0; pp < 10; pp++)
	res = polyransac(x, y, par);
	unsigned long int execution_time = get_current_time_in_us() - start_time;

	print_info(par, &res, execution_time);

	free(res.coef);
	free(x);
	free(y);
	}
	}