aolo2/bfs.cpp

## bfs.cpp
int main(int argc, char **argv)
{
	try
	{
        double t1 = omp_get_wtime();
        // считываем параметры командной сторки
        int scale = 12;
        int avg_degree = 15;
        string graph_type = "rmat";
        bool check = true;
        bool load_from_file = false;
        string file_name = "none";
        bool convert = false;
        string convert_name = "none";
        int iterations = 10;

        cout << "printing argv" << endl;
        for(int i = 0; i < argc; ++i)
            cout << argv[i] << endl;
        cout << "done" << endl;

        parse_cmd_params(argc, argv, scale, avg_degree, check, graph_type, load_from_file, file_name, convert, convert_name, iterations);
        cout << "cmd parameters parsed" << endl;

        Graph graph;

        if(convert)
        {
            cout << "convert mode: " << convert << endl;
            convert_real_graph(graph, convert_name, true);
            graph.save_to_binary_file(convert_name + ".el_graph");
            return 0;
        }

        if(load_from_file)
        {
            cout << "loading graph " << file_name << endl;
            graph.load_from_binary_file(file_name);
            cout << "loaded graph has " << graph.vertices_count << " vertices and " << graph.edges_count << " edges" << endl;
        }
        else
        {
            cout << "generating new graph" << endl;
            cout << "scale: " << scale << endl;
            cout << "avg_degree: " << avg_degree << endl;

            // генерируем граф
            if (graph_type == "rmat")
            {
                file_name = "rmat_" + std::to_string(scale) + "_" + std::to_string(avg_degree) + ".el_graph";
                generate_R_MAT(graph, pow(2.0, scale), avg_degree);
            }
            else if (graph_type == "random_uniform")
            {
                file_name = "ru_" + std::to_string(scale) + "_" + std::to_string(avg_degree) + ".el_graph";
                generate_random_uniform(graph, pow(2.0, scale), avg_degree);
            }
            else
            {
                cout << "Unknown graph type" << endl;
                return 1;
            }

            cout << "graph generated" << endl;
            graph.convert_to_undirected();

            cout << "conversion done" << endl;
            cout << "file name: " << file_name << endl;

            if(file_name != "none")
            {
                graph.save_to_binary_file(file_name);
                cout << "saved to " << file_name << " file" << endl;
            }
        }
        double t2 = omp_get_wtime();
        cout << "generation/load time: " << t2 - t1 << " sec" << endl;

        // преобразовываем граф
        GraphCSR csr_graph;
        cout << "conversion started" << endl;
        convert_edges_list_to_CSR(graph, csr_graph);
        cout << "converted" << endl;

        int *tmp;
        SAFE_CALL(cudaMalloc((void**)&tmp, sizeof(int)));
        cout << "test malloc done" << endl;

        double t_start = omp_get_wtime();

	int device_count = 1;
	cudaGetDeviceCount(&device_count);
	cout << "detected " << device_count << " devices" << endl;

        GraphCSR gpu_graph[device_count]; // VLA :((
	int *final_user_result;
	int final_last_source;

#pragma omp parallel num_threads(device_count)
{
	int t = omp_get_thread_num();
	cudaSetDevice(t);
	int *tmp22;
	cudaMalloc((void **) &tmp, sizeof(int));

	int *user_result;
	cudaMallocHost((void **) &user_result, csr_graph.vertices_count * sizeof(int));

	int *device_levels;
	cudaMalloc((void **) &device_levels, csr_graph.vertices_count * sizeof(int));

        // запускаем копирования данных
	int last_source = 0;

        t1 = omp_get_wtime();
        user_copy_graph_to_device(csr_graph, gpu_graph[t]);
        t2 = omp_get_wtime();

        cout << "Device->host copy time: " << t2 - t1 << " sec" << endl;

        // запускаем алгоритм
        cudaDeviceSynchronize();
        t1 = omp_get_wtime();
        cout << "will do " << iterations / device_count << " iterations" << endl;

        for(int i = 0; i < iterations / device_count; i++)
        {
            last_source = rand() % graph.vertices_count;
//	    printf("thread %d working on source %d\n", t, last_source);
            user_algorithm(gpu_graph[t], user_result, device_levels, last_source);
        }

	if (t == 0) {
		final_user_result = user_result;
		final_last_source = last_source;
	}

        cudaDeviceSynchronize();
        t2 = omp_get_wtime();
}
        double t_end = omp_get_wtime();
        cout << "BFS wall time: " << t2 - t1 << " sec" << endl;

#pragma omp parallel num_threads(device_count)
{
        free_memory(gpu_graph[omp_get_thread_num()]);
}

        cout << endl;
        cout << "#algorithm executed!" << endl;
        cout << "#perf: " << ((double)(iterations) * graph.edges_count) / ((t_end - t_start) * 1e6) << endl;
        cout << "#time: " << t_end - t_start << endl;
        cout << "#check: " << check << endl;

        // делаем проверку корректности каждый раз
        if(check)
        {
            verify_result(csr_graph, final_user_result, final_last_source);
        }


        // освобождаем память
//	cudaFreeHost(user_result);
//        cudaFree(device_levels);


	}
	catch (const char *error)
	{
		cout << error << endl;
	}
	catch (...)
	{
		cout << "unknown error" << endl;
	}

	return 0;
}
	int main(int argc, char **argv)
	{
	try
	{
	double t1 = omp_get_wtime();
	// считываем параметры командной сторки
	int scale = 12;
	int avg_degree = 15;
	string graph_type = "rmat";
	bool check = true;
	bool load_from_file = false;
	string file_name = "none";
	bool convert = false;
	string convert_name = "none";
	int iterations = 10;

	cout << "printing argv" << endl;
	for(int i = 0; i < argc; ++i)
	cout << argv[i] << endl;
	cout << "done" << endl;

	parse_cmd_params(argc, argv, scale, avg_degree, check, graph_type, load_from_file, file_name, convert, convert_name, iterations);
	cout << "cmd parameters parsed" << endl;

	Graph graph;

	if(convert)
	{
	cout << "convert mode: " << convert << endl;
	convert_real_graph(graph, convert_name, true);
	graph.save_to_binary_file(convert_name + ".el_graph");
	return 0;
	}

	if(load_from_file)
	{
	cout << "loading graph " << file_name << endl;
	graph.load_from_binary_file(file_name);
	cout << "loaded graph has " << graph.vertices_count << " vertices and " << graph.edges_count << " edges" << endl;
	}
	else
	{
	cout << "generating new graph" << endl;
	cout << "scale: " << scale << endl;
	cout << "avg_degree: " << avg_degree << endl;

	// генерируем граф
	if (graph_type == "rmat")
	{
	file_name = "rmat_" + std::to_string(scale) + "_" + std::to_string(avg_degree) + ".el_graph";
	generate_R_MAT(graph, pow(2.0, scale), avg_degree);
	}
	else if (graph_type == "random_uniform")
	{
	file_name = "ru_" + std::to_string(scale) + "_" + std::to_string(avg_degree) + ".el_graph";
	generate_random_uniform(graph, pow(2.0, scale), avg_degree);
	}
	else
	{
	cout << "Unknown graph type" << endl;
	return 1;
	}

	cout << "graph generated" << endl;
	graph.convert_to_undirected();

	cout << "conversion done" << endl;
	cout << "file name: " << file_name << endl;

	if(file_name != "none")
	{
	graph.save_to_binary_file(file_name);
	cout << "saved to " << file_name << " file" << endl;
	}
	}
	double t2 = omp_get_wtime();
	cout << "generation/load time: " << t2 - t1 << " sec" << endl;

	// преобразовываем граф
	GraphCSR csr_graph;
	cout << "conversion started" << endl;
	convert_edges_list_to_CSR(graph, csr_graph);
	cout << "converted" << endl;

	int *tmp;
	SAFE_CALL(cudaMalloc((void**)&tmp, sizeof(int)));
	cout << "test malloc done" << endl;

	double t_start = omp_get_wtime();

	int device_count = 1;
	cudaGetDeviceCount(&device_count);
	cout << "detected " << device_count << " devices" << endl;

	GraphCSR gpu_graph[device_count]; // VLA :((
	int *final_user_result;
	int final_last_source;

	#pragma omp parallel num_threads(device_count)
	{
	int t = omp_get_thread_num();
	cudaSetDevice(t);
	int *tmp22;
	cudaMalloc((void **) &tmp, sizeof(int));

	int *user_result;
	cudaMallocHost((void *) &user_result, csr_graph.vertices_count sizeof(int));

	int *device_levels;
	cudaMalloc((void *) &device_levels, csr_graph.vertices_count sizeof(int));

	// запускаем копирования данных
	int last_source = 0;

	t1 = omp_get_wtime();
	user_copy_graph_to_device(csr_graph, gpu_graph[t]);
	t2 = omp_get_wtime();

	cout << "Device->host copy time: " << t2 - t1 << " sec" << endl;

	// запускаем алгоритм
	cudaDeviceSynchronize();
	t1 = omp_get_wtime();
	cout << "will do " << iterations / device_count << " iterations" << endl;

	for(int i = 0; i < iterations / device_count; i++)
	{
	last_source = rand() % graph.vertices_count;
	// printf("thread %d working on source %d\n", t, last_source);
	user_algorithm(gpu_graph[t], user_result, device_levels, last_source);
	}

	if (t == 0) {
	final_user_result = user_result;
	final_last_source = last_source;
	}

	cudaDeviceSynchronize();
	t2 = omp_get_wtime();
	}
	double t_end = omp_get_wtime();
	cout << "BFS wall time: " << t2 - t1 << " sec" << endl;

	#pragma omp parallel num_threads(device_count)
	{
	free_memory(gpu_graph[omp_get_thread_num()]);
	}

	cout << endl;
	cout << "#algorithm executed!" << endl;
	cout << "#perf: " << ((double)(iterations) * graph.edges_count) / ((t_end - t_start) * 1e6) << endl;
	cout << "#time: " << t_end - t_start << endl;
	cout << "#check: " << check << endl;

	// делаем проверку корректности каждый раз
	if(check)
	{
	verify_result(csr_graph, final_user_result, final_last_source);
	}


	// освобождаем память
	// cudaFreeHost(user_result);
	// cudaFree(device_levels);


	}
	catch (const char *error)
	{
	cout << error << endl;
	}
	catch (...)
	{
	cout << "unknown error" << endl;
	}

	return 0;
	}