|
#include "aux.h" |
|
#include <algorithm> |
|
#include <atomic> |
|
#include <chrono> |
|
#include <iostream> |
|
#include <omp.h> |
|
#include <random> |
|
#include <vector> |
|
|
|
void process_arrays(std::vector<float> &res, const std::vector<int> &indices, |
|
const std::vector<float> &values) { |
|
int n = indices.size(); |
|
|
|
// Check if indices and values vectors are empty |
|
if (n == 0) |
|
return; |
|
|
|
#pragma omp parallel |
|
{ |
|
float acc = 0.0; // Accumulator for the current index |
|
int last_index = -1; // Initialize last_index with the first index |
|
|
|
#pragma omp for nowait |
|
for (int i = 0; i < n; ++i) { |
|
if (i == 0 || indices[i] != last_index) { |
|
if (last_index != -1) { |
|
atomic_add(&res[last_index], |
|
acc); // Update the last index with accumulated value |
|
} |
|
acc = values[i]; // Reset accumulator for new index |
|
last_index = indices[i]; // Update the last index |
|
} else { |
|
// Same index, accumulate the values |
|
acc += values[i]; |
|
} |
|
} |
|
// Update the last index after the loop |
|
atomic_add(&res[last_index], acc); |
|
} |
|
} |
|
|
|
void process_array_atomic(std::vector<float> &res, |
|
const std::vector<int> &indices, |
|
const std::vector<float> &values) { |
|
int n = indices.size(); |
|
|
|
// Check if indices and values vectors are empty |
|
if (n == 0) |
|
return; |
|
#pragma omp parallel |
|
{ |
|
#pragma omp for nowait |
|
for (int i = 0; i < n; ++i) { |
|
atomic_add(&res[indices[i]], values[i]); |
|
} |
|
} |
|
} |
|
|
|
void process_array_sequential(std::vector<float> &res, |
|
const std::vector<int> &indices, |
|
const std::vector<float> &values) { |
|
int n = indices.size(); |
|
|
|
// Check if indices and values vectors are empty |
|
if (n == 0) |
|
return; |
|
|
|
float acc = 0.0; // Accumulator for the current index |
|
int last_index = indices[0]; // Initialize last_index with the first index |
|
|
|
// Set the first value outside the loop to handle edge cases |
|
acc = values[0]; |
|
|
|
for (int i = 1; i < n; ++i) { |
|
if (indices[i] != last_index) { |
|
// Different index encountered, update the last index with accumulated |
|
// value |
|
res[last_index] += acc; |
|
acc = values[i]; // Reset accumulator for new index |
|
last_index = indices[i]; // Update the last index |
|
} else { |
|
// Same index, accumulate the values |
|
acc += values[i]; |
|
} |
|
} |
|
|
|
// Update the last index after the loop |
|
res[last_index] += acc; |
|
} |
|
|
|
int main() { |
|
// Example usage |
|
const int num_elements = 1000; // Number of elements in the vector |
|
const int num_reduce = 10; // Number of elements to reduce |
|
std::vector<float> res( |
|
num_elements, |
|
0.0f); // Initialize with appropriate size and default values |
|
std::vector<float> golden_res( |
|
num_elements, |
|
0.0f); // Initialize with appropriate size and default values |
|
std::random_device rd; // Obtain a random number from hardware |
|
std::mt19937 gen(rd()); // Seed the generator |
|
std::uniform_int_distribution<> distr(0, num_reduce); // Define the range |
|
|
|
std::vector<int> indices; |
|
// Generate random integers and add them to the vector |
|
for (int i = 0; i < num_elements; ++i) { |
|
indices.push_back(distr(gen)); |
|
} |
|
|
|
// Sort the vector |
|
std::sort(indices.begin(), indices.end()); |
|
std::vector<float> values; |
|
// Generate random values and add them to the vector |
|
for (int i = 0; i < num_elements; ++i) { |
|
values.push_back(rand() % 100 / 100.0f); |
|
} |
|
// warm up |
|
process_arrays(res, indices, values); |
|
process_array_sequential(golden_res, indices, values); |
|
for (int i = 0; i < num_reduce; ++i) { |
|
if (res[i] - golden_res[i] > 0.0001) { |
|
std::cout << "Error at index " << i << " res[i] = " << res[i] |
|
<< " golden_res[i] = " << golden_res[i] << std::endl; |
|
return 1; |
|
} |
|
} |
|
|
|
const int num_repeats = 10; // Number of times to repeat the function |
|
auto start = std::chrono::high_resolution_clock::now(); |
|
|
|
for (int i = 0; i < num_repeats; ++i) { |
|
process_arrays(res, indices, values); |
|
} |
|
|
|
auto stop = std::chrono::high_resolution_clock::now(); |
|
auto total_duration = |
|
std::chrono::duration_cast<std::chrono::microseconds>(stop - start); |
|
|
|
std::cout << "Total time for " << num_repeats |
|
<< " repetitions: " << total_duration.count() << " microseconds" |
|
<< std::endl; |
|
|
|
auto avg_duration = total_duration.count() / static_cast<double>(num_repeats); |
|
std::cout << "Average time per function call:(parallel) " << avg_duration |
|
<< " microseconds" << std::endl; |
|
|
|
start = std::chrono::high_resolution_clock::now(); |
|
|
|
for (int i = 0; i < num_repeats; ++i) { |
|
process_array_sequential(res, indices, values); |
|
} |
|
|
|
stop = std::chrono::high_resolution_clock::now(); |
|
total_duration = |
|
std::chrono::duration_cast<std::chrono::microseconds>(stop - start); |
|
|
|
std::cout << "Total time for " << num_repeats |
|
<< " repetitions: " << total_duration.count() << " microseconds" |
|
<< std::endl; |
|
|
|
avg_duration = total_duration.count() / static_cast<double>(num_repeats); |
|
std::cout << "Average time per function call:(sequential)" << avg_duration |
|
<< " microseconds" << std::endl; |
|
|
|
start = std::chrono::high_resolution_clock::now(); |
|
|
|
for (int i = 0; i < num_repeats; ++i) { |
|
process_array_atomic(res, indices, values); |
|
} |
|
stop = std::chrono::high_resolution_clock::now(); |
|
total_duration = |
|
std::chrono::duration_cast<std::chrono::microseconds>(stop - start); |
|
|
|
std::cout << "Total time for " << num_repeats |
|
<< " repetitions: " << total_duration.count() << " microseconds" |
|
<< std::endl; |
|
avg_duration = total_duration.count() / static_cast<double>(num_repeats); |
|
std::cout << "Average time per function call:(atomic)" << avg_duration |
|
<< " microseconds" << std::endl; |
|
return 0; |
|
} |
g++ -fopenmp segreduce.cpp -o segreduce