|
// Barrier usage example. |
|
// Compile with: gcc -O2 -Wall -pthread barrier-example.c -o barrier-example |
|
|
|
// IMPORTANT NOTE: this reduction scheme is borrowed from the massively parallel,
// fine-grained GPU world; it is NOT suitable for real applications with pthreads!
|
|
|
#include <stdio.h> |
|
#include <stdlib.h> |
|
|
|
#include <pthread.h> |
|
|
|
|
|
// Compile-time configuration of the reduction, kept as enumerators so the
// values are typed ints that are visible in a debugger.
enum {
    // size of the array to reduce
    N = 10000,

    // number of threads - the pairwise reduction is written for a power of 2
    THREADS = 32,

    // how many elements a thread will process: N / THREADS, rounded up so
    // that THREADS slices always cover the whole array
    BLOCKSIZE = (N + THREADS - 1) / THREADS
};
|
|
|
// Work description handed to one thread at creation time.
// Each thread receives a pointer to its own instance.
struct thread_params {
    // first element of the slice this thread has to sum
    int *start;
    // destination of the thread's partial sum
    int *out;
    // number of consecutive elements in the slice
    int n;
    // index of the thread
    int id;
};
|
|
|
// thread syncing barrier |
|
pthread_barrier_t barrier; |
|
|
|
// the thread work function |
|
void *thread_func(void *args) { |
|
struct thread_params *tparm; |
|
int *pa,*out; |
|
int i,n,sum,id,stride; |
|
|
|
// thread input params |
|
tparm = (struct thread_params *)args; |
|
pa = tparm->start; |
|
n = tparm->n; |
|
out = tparm->out; |
|
id = tparm->id; |
|
|
|
// phase 1: each thread reduces its part |
|
sum = 0; |
|
for (i=0;i<n;i++) { |
|
sum += pa[i]; |
|
} |
|
// store partial sum |
|
*out = sum; |
|
printf("Thread %d: partial sum=%d\n",id,sum); |
|
|
|
// phase 2: half of threads in each round sum a pair of values |
|
for (stride=1;stride<THREADS;stride*=2) { |
|
// sync on barrier, for all threads |
|
pthread_barrier_wait(&barrier); // after sync, barrier goes to its init() state |
|
|
|
if (id%(2*stride)==0) { // half of previous round |
|
// NOTE: this printf will show that all operations with same stride are together thanks to barrier |
|
printf("Thread %d: summing %d + %d (stride is %d)\n",id,*out,*(out+stride),stride); |
|
*out = (*out)+*(out+stride); |
|
} |
|
} |
|
|
|
// exit and let be joined |
|
pthread_exit(NULL); |
|
} |
|
|
|
|
|
|
|
int main() { |
|
int *a; |
|
int i,check; |
|
|
|
// array of structs to fill and pass to threads on creation |
|
struct thread_params tparm[THREADS]; |
|
// table of thread IDs (handles) filled on creation, to be used later on join |
|
pthread_t threads[THREADS]; |
|
|
|
// partial reduction table - one element per thread |
|
int partial[THREADS]; |
|
|
|
// initialize barrier - always on all threads |
|
pthread_barrier_init (&barrier, NULL, THREADS); |
|
|
|
// allocate vector array |
|
a = (int *)malloc(N*sizeof(int)); |
|
if (a==NULL) exit(1); |
|
|
|
//initialize vector |
|
for (i=0;i<N;i++) { |
|
a[i]=i+1; // 1...N |
|
} |
|
|
|
// create all threads |
|
check = 0; |
|
for (i=0;i<THREADS;i++) { |
|
// fill params for this thread |
|
tparm[i].start = a+i*BLOCKSIZE; |
|
tparm[i].id = i; |
|
tparm[i].out = &partial[i]; |
|
if ((check+BLOCKSIZE)>=N) { // less than blocksize to do... |
|
tparm[i].n = N-check; |
|
} |
|
else { |
|
tparm[i].n = BLOCKSIZE; // there IS blocksize work to do! |
|
} |
|
check += BLOCKSIZE; |
|
|
|
// create thread with default attrs (attrs=NULL) |
|
if (pthread_create(&threads[i],NULL,thread_func,&tparm[i])!=0) { |
|
printf("Error in thread creation!\n"); |
|
exit(1); |
|
} |
|
} |
|
|
|
// block on join of threads |
|
for (i=0;i<THREADS;i++) { |
|
pthread_join(threads[i],NULL); |
|
} |
|
|
|
|
|
// check results |
|
if (partial[0]!=((N*(N+1))/2)) { |
|
printf("computation error!\n"); |
|
} |
|
// free vector |
|
free(a); |
|
|
|
return 0; |
|
} |