Skip to content

Instantly share code, notes, and snippets.

@gongzhitaao
Last active December 25, 2015 09:59
Show Gist options
  • Save gongzhitaao/6957806 to your computer and use it in GitHub Desktop.
Simple sequential and parallel implementation of convolution in C++
#include <fstream>
#include <mpi.h>
inline double convolute(double **img, const double (*w)[11],
int r, int c)
{
double s = 0;
for (int i = 0; i < 11; ++i)
for (int j = 0; j < 11; ++j)
s += img[r-5+i][c-5+j] * w[i][j];
return s;
}
// MPI-parallel 11x11 mean-filter convolution on a MAT x MAT image.
// Rank 0 owns the full zero-padded matrix, sends each worker its row band
// plus a halo of neighboring rows, convolves its own band in place with a
// small rolling buffer, then gathers the workers' finished bands back.
int main()
{
    MPI::Init();
    int rank = MPI::COMM_WORLD.Get_rank();
    int PE = MPI::COMM_WORLD.Get_size();   // number of MPI processes
    enum {
        MAT = 180,        // logical image size (square)
        L = 5,            // window half-width
        WIN = 2 * L + 1,  // full window size (11)
        TMP = L + 1,      // rows kept in the rolling buffer
        EXT = MAT + WIN,  // padded matrix size
        C0 = L,           // first valid column
        C1 = MAT + L      // one past the last valid column
    };
    int MSG = MAT / PE,   // rows each process is responsible for
        BUF = MSG + WIN,  // rows a worker holds (band + halo)
        R1 = MSG + TMP;   // exclusive end row of the local sweep
    // Uniform averaging window: every weight is 1/121.
    double win[WIN][WIN];
    for (int i = 0; i < WIN; ++i)
        for (int j = 0; j < WIN; ++j)
            win[i][j] = 1.0 / 121;
    // Rolling buffer of TMP rows: a convolved row is parked here until the
    // source rows it overlaps are no longer read, enabling in-place update.
    // Fix: value-initialize with () — the original left this (and the
    // matrices below) uninitialized, so the padding rows and the first
    // buffered rows were read before ever being written. The OpenMP and
    // sequential versions of this program already zero-initialize.
    double *tmp_mem = new double[TMP * EXT]();
    double **tmp = new double *[TMP];
    for (int i = 0; i < TMP; ++i)
        tmp[i] = tmp_mem + i * EXT;
    if (0 == rank) {
        // Root allocates the full padded matrix; contiguous storage lets a
        // whole row band travel in a single MPI message.
        double *mat_mem = new double[EXT * EXT]();
        double **mat = new double *[EXT];
        for (int i = 0; i < EXT; ++i)
            mat[i] = mat_mem + i * EXT;
        // Test pattern: every other column of the interior is 121, so the
        // 11x11 mean is a known value.
        for (int i = TMP; i < MAT + TMP; ++i)
            for (int j = C0; j < C1; j += 2)
                mat[i][j] = 121;
        // Ship each worker its band plus the surrounding halo rows.
        for (int k = 1; k < PE; ++k)
            MPI::COMM_WORLD.Send(mat[1 + k * MSG], (BUF - 1) * EXT,
                                 MPI::DOUBLE, k, 0);
        // Convolve rank 0's own band: new values go into `tmp`; a finished
        // row is copied back into `mat` only once the sweep is TMP rows
        // past it, i.e. its original values are no longer needed.
        for (int i = TMP; i < R1; ++i) {
            for (int j = C0; j < C1; ++j) {
                mat[i - TMP][j] = tmp[i % TMP][j];
                tmp[i % TMP][j] = convolute(mat, win, i, j);
            }
        }
        // Flush the last TMP buffered rows into place.
        for (int i = R1 - TMP; i < R1; ++i)
            for (int j = C0; j < C1; ++j)
                mat[i][j] = tmp[i % TMP][j];
        // Gather each worker's finished band back into the matrix.
        for (int k = 1; k < PE; ++k) {
            MPI::COMM_WORLD.Recv(mat[TMP + k * MSG], MSG * EXT,
                                 MPI::DOUBLE, k, 0);
        }
        delete [] mat_mem;
        delete [] mat;
    } else {
        // Worker: receive band + halo, convolve with the same rolling
        // buffer scheme, send the finished band back to rank 0.
        double *buf_mem = new double[BUF * EXT]();
        double **buf = new double *[BUF];
        for (int i = 0; i < BUF; ++i)
            buf[i] = buf_mem + i * EXT;
        MPI::COMM_WORLD.Recv(buf[1], (BUF - 1) * EXT, MPI::DOUBLE, 0, 0);
        for (int i = TMP; i < R1; ++i) {
            for (int j = C0; j < C1; ++j) {
                buf[i - TMP][j] = tmp[i % TMP][j];
                tmp[i % TMP][j] = convolute(buf, win, i, j);
            }
        }
        for (int i = R1 - TMP; i < R1; ++i)
            for (int j = C0; j < C1; ++j)
                buf[i][j] = tmp[i % TMP][j];
        MPI::COMM_WORLD.Send(buf[TMP], MSG * EXT, MPI::DOUBLE, 0, 0);
        delete [] buf_mem;
        delete [] buf;
    }
    delete [] tmp_mem;
    delete [] tmp;  // fix: the row-pointer array itself was leaked
    MPI::Finalize();
}
#include <fstream>
#include <omp.h>
#define NUM_THREAD 6
// Weighted sum of the 11x11 neighborhood of img[r][c] under window `w`.
// Precondition: a 5-cell border around (r, c) lies inside `img`.
inline double convolute(double **img, const double (*w)[11],
int r, int c)
{
    double acc = 0.0;
    for (int di = -5; di <= 5; ++di)
        for (int dj = -5; dj <= 5; ++dj)
            acc += img[r + di][c + dj] * w[di + 5][dj + 5];
    return acc;
}
// OpenMP-parallel 11x11 mean-filter convolution on an M x M image.
// Each thread convolves a band of rows in place using a per-thread rolling
// buffer; the rows straddling band boundaries are backed up beforehand in
// `bak` and convolved in a second pass. Timing is logged to "log-parallel".
int main()
{
    double t0 = omp_get_wtime();
    enum ConstVal {
        M = 2160,                  // square matrix size
        C0 = 11,                   // window size
        C1 = 6,                    // overlap region (rolling-buffer depth)
        C2 = 5,                    // margin for overlap region
        C3 = M / NUM_THREAD,       // total rows for each thread
        C4 = M / NUM_THREAD - C1,  // safe rows for each thread
        C5 = C1 + 2 * C2,          // rows backed up for the overlap pass
        S = M + 2 * C1             // extended (padded) matrix size
    };
    double win[C0][C0];
    // allocation — () value-initializes rows to zero (the padding relies
    // on this)
    double **mat = new double *[S];
    for (int i = 0; i < S; ++i)
        mat[i] = new double [S]();
    // per-thread rolling buffers for the in-place sweep
    double ***buf = new double **[NUM_THREAD];
    for (int k = 0; k < NUM_THREAD; ++k) {
        buf[k] = new double *[C1];
        for (int i = 0; i < C1; ++i)
            buf[k][i] = new double [S]();
    }
    // per-thread backup of the rows around each band boundary
    double ***bak = new double **[NUM_THREAD];
    for (int k = 0; k < NUM_THREAD; ++k) {
        bak[k] = new double *[C5];
        for (int i = 0; i < C5; ++i)
            bak[k][i] = new double [S]();
    }
    double t1 = omp_get_wtime();
    // initialization: uniform 1/121 window, alternating-column test image
    for (int i = 0; i < C0; ++i)
        for (int j = 0; j < C0; ++j)
            win[i][j] = 1.0 / 121;
    #pragma omp parallel for collapse(2) num_threads(NUM_THREAD)
    for (int i = C1; i < M+C1; ++i)
        for (int j = C1; j < M+C1; j += 2)
            mat[i][j] = 121;
    // seed each thread's rolling buffer with the rows just above its band
    #pragma omp parallel for num_threads(NUM_THREAD)
    for (int k = 0; k < NUM_THREAD; ++k)
        for (int i = 0; i < C1; ++i)
            for (int j = C1; j < M+C1; ++j)
                buf[k][i][j] = mat[i+k*C3][j];
    // snapshot the boundary rows before they are overwritten in place
    #pragma omp parallel for num_threads(NUM_THREAD)
    for (int k = 0; k < NUM_THREAD; ++k)
        for (int i = 0; i < C5; ++i)
            for (int j = C1; j < M+C1; ++j)
                bak[k][i][j] = mat[C1+(k+1)*C3-C0+i][j];
    double t2 = omp_get_wtime();
    // main pass: convolve the safe interior of each band in place, parking
    // each new row in buf until its source rows are no longer read
    #pragma omp parallel for num_threads(NUM_THREAD)
    for (int k = 0; k < NUM_THREAD; ++k) {
        for (int i = C1+k*C3; i < C1+C4+k*C3; ++i) {
            for (int j = C1; j < M+C1; ++j) {
                mat[i-C1][j] = buf[k][i%C1][j];
                buf[k][i%C1][j] = convolute(mat, win, i, j);
            }
        }
        for (int i = k*C3+C4; i < C1+C4+k*C3; ++i)
            for (int j = C1; j < M+C1; ++j)
                mat[i][j] = buf[k][i%C1][j];
    }
    // overlap pass: redo the boundary rows from the clean backup
    #pragma omp parallel for num_threads(NUM_THREAD)
    for (int k = 0; k < NUM_THREAD; ++k) {
        for (int i = C2; i < C1+C2; ++i)
            for (int j = C1; j < M+C1; ++j)
                mat[C1+k*C3+C4-C2+i][j] = convolute(bak[k], win, i, j);
    }
    double t3 = omp_get_wtime();
    for (int i = 0; i < S; ++i)
        delete[] mat[i];
    delete[] mat;
    for (int k = 0; k < NUM_THREAD; ++k) {
        for (int i = 0; i < C1; ++i)
            delete[] buf[k][i];
        delete[] buf[k];
    }
    delete[] buf;  // fix: outer pointer array was leaked
    // fix: `bak` was never freed at all
    for (int k = 0; k < NUM_THREAD; ++k) {
        for (int i = 0; i < C5; ++i)
            delete[] bak[k][i];
        delete[] bak[k];
    }
    delete[] bak;
    double t4 = omp_get_wtime();
    // append timing breakdown: threads, alloc, init, convolve, free, total
    std::ofstream w("log-parallel", std::ios::app);
    char msg[512];
    sprintf(msg, "%d,%f,%f,%f,%f,%f\n",
            NUM_THREAD, t1-t0, t2-t1, t3-t2, t4-t3, t4-t0);
    w << msg << std::endl;
    w.close();
    return 0;
}
#include <fstream>
#include <omp.h>
// Return the 11x11 windowed sum of `img` around (r, c) with weights `w`.
// The 5-cell border around (r, c) must be valid memory.
inline double convolute(double **img, const double (*w)[11],
int r, int c)
{
    double acc = 0.0;
    for (int di = -5; di <= 5; ++di)
        for (int dj = -5; dj <= 5; ++dj)
            acc += img[r + di][c + dj] * w[di + 5][dj + 5];
    return acc;
}
// Sequential baseline: 11x11 mean-filter convolution on an M x M image,
// done in place with a rolling buffer of WIN_HALF rows. Timing phases are
// written to "res-sequential".
int main()
{
    double t0 = omp_get_wtime();
    const int M = 2160;
    const int WIN_SIZE = 11;
    const int WIN_HALF = 6;
    const int beg = WIN_HALF, end = M + WIN_HALF;
    double win[WIN_SIZE][WIN_SIZE];
    // allocation — () zero-initializes, which provides the padding border
    double **mat = new double *[M + 2*WIN_HALF];
    for (int i = 0; i < M + 2*WIN_HALF; ++i)
        mat[i] = new double [M+2*WIN_HALF]();
    double **buf = new double *[WIN_HALF];
    for (int i = 0; i < WIN_HALF; ++i)
        buf[i] = new double [M+2*WIN_HALF]();
    double t1 = omp_get_wtime();
    // initialization: uniform 1/121 window, alternating-column test image
    for (int i = 0; i < WIN_SIZE; ++i)
        for (int j = 0; j < WIN_SIZE; ++j)
            win[i][j] = 1.0 / 121;
    for (int i = beg; i < end; ++i)
        for (int j = beg; j < end; j += 2)
            mat[i][j] = 121;
    double t2 = omp_get_wtime();
    // convolution: park each convolved row in `buf` and copy it back into
    // `mat` only once the sweep no longer reads the original row
    for (int i = beg; i < end; ++i) {
        for (int j = beg; j < end; ++j) {
            mat[i-WIN_HALF][j] = buf[i%WIN_HALF][j];
            buf[i%WIN_HALF][j] = convolute(mat, win, i, j);
        }
    }
    // flush the last WIN_HALF buffered rows
    for (int i = end-WIN_HALF; i < end; ++i)
        for (int j = beg; j < end; ++j)
            mat[i][j] = buf[i%WIN_HALF][j];
    double t3 = omp_get_wtime();
    for (int i = 0; i < M+2*WIN_HALF; ++i)
        delete[] mat[i];
    delete[] mat;  // fix: outer pointer array was leaked
    for (int i = 0; i < WIN_HALF; ++i)
        delete[] buf[i];
    delete[] buf;  // fix: outer pointer array was leaked
    double t4 = omp_get_wtime();
    // log: thread count (1), alloc, init, convolve, free, total
    std::ofstream w("res-sequential");
    char msg[512];
    sprintf(msg, "1,%f,%f,%f,%f,%f\n",
            t1-t0, t2-t1, t3-t2, t4-t3,t4-t0);
    w << msg << std::endl;
    w.close();
    return 0;
}
import numpy as np
import matplotlib.pyplot as plt
def exec_time(tm):
    """Plot a stacked bar chart of the per-configuration time breakdown.

    ``tm`` rows come from the timing log; columns are presumably
    config-index, alloc, init, convolve, free, total -- TODO confirm
    against the C++ loggers.
    """
    bar_w = 0.35
    fig = plt.figure()
    axes = fig.add_subplot(111)
    # everything except the convolution itself is lumped into "misc"
    misc_height = tm[:, 4] + tm[:, 1] + tm[:, 2]
    free = axes.bar(tm[:, 0], misc_height, bar_w,
                    fill=False, hatch='//')
    conv = axes.bar(tm[:, 0], tm[:, 3], bar_w, fill=False,
                    bottom=tm[:, 1] + tm[:, 2] + tm[:, 4])
    axes.set_xlim(-bar_w / 2, 6 + bar_w + bar_w / 2)
    axes.set_ylabel('Time (s)')
    axes.set_title('Convolution')
    axes.set_xticks(tm[:, 0] + bar_w / 2)
    axes.set_xticklabels(['Sequential', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6'])
    axes.grid()
    axes.legend((free, conv), ('misc', 'Convolute'))
def speedup(tm):
    """Bar chart of convolution speedup of each parallel run vs row 0."""
    bar_w = 0.35
    fig = plt.figure()
    axes = fig.add_subplot(111)
    # baseline convolution time divided by each parallel convolution time
    ratios = [tm[0, 3] / t for t in tm[1:, 3]]
    xs = np.arange(6)
    axes.bar(xs, ratios, bar_w, fill=False)
    axes.set_xlim(-bar_w / 2, 5 + bar_w + bar_w / 2)
    axes.set_ylabel('Speedup Factor')
    axes.set_title('Speedup (Convolution)')
    axes.set_xticks(xs + bar_w / 2)
    axes.set_xticklabels(['P1', 'P2', 'P3', 'P4', 'P5', 'P6'])
    axes.grid()
def efficiency(tm):
    """Bar chart of parallel efficiency: speedup divided by thread count."""
    bar_w = 0.35
    fig = plt.figure()
    axes = fig.add_subplot(111)
    # column 5 is presumably total wall time -- TODO confirm; row i+1 uses
    # i+1 threads, so efficiency = T_seq / (T_par * threads)
    ratios = [tm[0, 5] / (t * (i + 1)) for i, t in enumerate(tm[1:, 5])]
    xs = np.arange(6)
    axes.bar(xs, ratios, bar_w, fill=False)
    axes.set_xlim(-bar_w / 2, 5 + bar_w + bar_w / 2)
    axes.set_ylabel('Efficiency Factor')
    axes.set_title('Efficiency')
    axes.set_xticks(xs + bar_w / 2)
    axes.set_xticklabels(['P1', 'P2', 'P3', 'P4', 'P5', 'P6'])
    axes.grid()
if __name__ == '__main__':
    # Load the comma-separated timing log produced by the C++ programs.
    tm = np.genfromtxt('timelog', delimiter=',')
    exec_time(tm)
    # Bug fix: these disabled calls used C++-style '//' comments, which are
    # a SyntaxError in Python; they must be '#' comments.
    # speedup(tm)
    # efficiency(tm)
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment