Duplicate some number of bytes, in chunks, from an input buffer to an output buffer.
Contents:
- python prototype (
python3
) - sequential implementation (
cpp
) - parallel implementation (
cuda
) - makefile example (
mingw32-make
) - example output (
powershell
)
Why? Sometimes you just need to repeat some bytes! (upsampling, data redundancy, data padding)
# dupleproto.py
#
# PROTOTYPE (python3) VERSION of Byte Repeater
# Walks through the index math for repeating fixed-size chunks of a buffer.
#
# Usage: `python3 dupleproto.py`
memsize = 10     # total number of input elements
num_rpt = 3      # how many times each chunk is repeated
chunk_size = 2   # elements per repeated chunk

orig = list(range(memsize))
print('orig =', orig)

# The chunks must tile the input exactly.
if memsize % chunk_size != 0:
    print('error, chunk must be divisible...')
    exit(1)

# Output buffer: one slot per (input element, repetition) pair.
nu = [0] * (memsize * num_rpt)

# base  -> start of a source chunk in orig
# off   -> position of a byte within that chunk
# rep   -> which repetition of the chunk we are filling
for base in range(0, memsize, chunk_size):
    for off in range(chunk_size):
        for rep in range(num_rpt):
            # destination = expanded chunk base (base*num_rpt)
            #             + byte offset (off) + repetition stride (rep*chunk_size)
            nu[base * num_rpt + off + rep * chunk_size] = orig[base + off]

print('nu =', nu)
// duple.cpp
// SERIAL (cpu) VERSION of Byte Repeater
//
// Usage: `g++ -o duple.exe duple.cpp && ./duple.exe`
#include <iostream>
#include <string.h>
#include <string>
using namespace std;
int main() {
cout << "duple start" << endl;
string indat = "0123456789";
int memsize = indat.length();
int rpt_chunk_size = 5; // num bytes in repeated chunk
int num_rpt = 4; // num repetitions of each chunk_size
///////////////////////////////////////////////////////////////////////////
if (memsize % rpt_chunk_size != 0) {
cout << "PROBLEMATIC! (chunks must be divisible)" << endl;
return 1;
}
int upsamp_size = memsize*num_rpt;
char *upsampled_mem = (char*)malloc(upsamp_size);
memset(upsampled_mem, 0, upsamp_size);
///////////////////////////////////////////////////////////////////////////
// do duple (sequential)
for (int i = 0; i < memsize; i += rpt_chunk_size) {
for (int j = 0; j < rpt_chunk_size; j++) {
for (int k = 0; k < num_rpt; k++) {
// IDX FROM BASE FOR NEW CHUNK: j + k * chunk_size
// BASE IDX FOR NEW CHUNK: i * num_rpt
int new_idx = i*num_rpt + j + k*rpt_chunk_size;
upsampled_mem[new_idx] = indat[i+j];
}
}
}
// print results
cout << endl;
for (int i = 0; i < upsamp_size; i++) {
cout << upsampled_mem[i];
}
cout << endl << endl;
// cleanup
free(upsampled_mem);
///////////////////////////////////////////////////////////////////////////
return 0;
}
// duple-gpu.cu
//
// PARALLEL (cuda) VERSION of Byte Repeater
//
// note: useful to figure out indexing:
// https://cs.calvin.edu/courses/cs/374/CUDA/CUDA-Thread-Indexing-Cheatsheet.pdf
// (using 1dgrid of 3dblocks here)
//
// TODO: consider how this scales and how adding blocks might work/change it
// TODO: gate the memsize, chunk size, and n_rpt by num thds available?
//
// Usage: `nvcc -o duple-gpu.exe duple-gpu.cu && ./duple-gpu.exe`
#include <iostream>
#include <string.h>
using namespace std;
// duple_kernel: writes each chnk_rpt-byte chunk of dev_in into dev_out
// n_rpt times.
//
// Expected launch layout: a single block whose blockDim spans
// (num_chunks, chnk_rpt, n_rpt) -- only threadIdx is consulted here:
//   threadIdx.x -> which chunk, threadIdx.y -> byte within the chunk,
//   threadIdx.z -> which repetition of the chunk.
// (see sequential/cpu version for the equivalent for-loop nest)
__global__ void duple_kernel(char* dev_out, char* dev_in, int n_rpt,
    int chnk_rpt, int up_size, int in_size) {
  int chunk_base = threadIdx.x * chnk_rpt; // start of this chunk in dev_in
  // gate every lane by the problem dimensions (mirrors the CPU loop bounds)
  if (threadIdx.x >= in_size || chunk_base >= in_size)
    return;
  if (threadIdx.y >= in_size || threadIdx.y >= chnk_rpt)
    return;
  if (threadIdx.z >= in_size || threadIdx.z >= n_rpt)
    return;
  // destination = expanded chunk base + byte offset + repetition stride
  int new_idx = chunk_base * n_rpt + threadIdx.y + threadIdx.z * chnk_rpt;
  if (new_idx < up_size) {
    dev_out[new_idx] = dev_in[chunk_base + threadIdx.y];
  }
}
// Entry point: uploads `indat`, launches duple_kernel to repeat each
// rpt_chunk_size-byte chunk num_rpt times, copies the result back, and
// prints it. Returns 0 on success, 1 on any precondition/CUDA failure.
int main() {
  ///////////////////////////////////////////////////////////////////////////
  cout << "duple start" << endl;
  const char *indat = "0123456789";
  int memsize = 10;
  int rpt_chunk_size = 5; // num bytes in repeated chunk
  int num_rpt = 4;        // num repetitions of each chunk_size
  // Host setup
  if (memsize % rpt_chunk_size != 0) {
    cout << "PROBLEMATIC! (chunks must be divisible)" << endl;
    return 1;
  }
  int upsamp_size = memsize*num_rpt;
  char *upsampled_mem = (char*)malloc(upsamp_size);
  if (upsampled_mem == NULL) { // original left this allocation unchecked
    cout << "PROBLEMATIC! (host malloc failed)" << endl;
    return 1;
  }
  memset(upsampled_mem, 0, upsamp_size);
  ///////////////////////////////////////////////////////////////////////////
  // GPU setup
  //
  // The kernel indexes with threadIdx only, so we launch ONE block whose
  // blockDim spans (num_chunks, bytes-per-chunk, repetitions).
  // NOTE: the original passed these two dim3s to <<<...>>> in swapped
  // positions (<<<block, grid>>>); it still ran correctly only because the
  // (1,1,1) value happened to land in the grid slot. Names fixed here.
  // memsize/rpt_chunk_size is exact integer division (divisibility was
  // checked above), so no ceil() of an already-truncated quotient.
  dim3 grid(1, 1, 1);
  dim3 block(memsize / rpt_chunk_size, rpt_chunk_size, num_rpt);
  char *dev_usampled_mem, *dev_indat;
  // Every CUDA call returns a status; propagate the first failure.
  cudaError_t err = cudaMalloc((void**)&dev_usampled_mem, upsamp_size);
  if (err == cudaSuccess)
    err = cudaMalloc((void**)&dev_indat, memsize);
  if (err == cudaSuccess)
    err = cudaMemcpy(dev_indat, indat, memsize, cudaMemcpyHostToDevice);
  // GPU exec
  if (err == cudaSuccess) {
    duple_kernel<<<grid,block>>>(dev_usampled_mem,
                                 dev_indat, num_rpt,
                                 rpt_chunk_size,
                                 upsamp_size,
                                 memsize);
    err = cudaGetLastError(); // catches launch-configuration errors
  }
  // GPU get results and cleanup
  if (err == cudaSuccess)
    err = cudaDeviceSynchronize(); // catches asynchronous execution errors
  if (err == cudaSuccess)
    err = cudaMemcpy(upsampled_mem, dev_usampled_mem, upsamp_size,
                     cudaMemcpyDeviceToHost);
  cudaFree(dev_indat);
  cudaFree(dev_usampled_mem);
  if (err != cudaSuccess) {
    cout << "PROBLEMATIC! (CUDA: " << cudaGetErrorString(err) << ")" << endl;
    free(upsampled_mem);
    return 1;
  }
  ///////////////////////////////////////////////////////////////////////////
  // print results
  cout << endl;
  cout << "inputted: ";
  for (int i = 0; i < memsize; i++) {
    cout << indat[i];
  }
  cout << endl << "outputted: ";
  for (int i = 0; i < upsamp_size; i++) {
    cout << upsampled_mem[i];
  }
  cout << endl << endl;
  // Host cleanup
  free(upsampled_mem);
  ///////////////////////////////////////////////////////////////////////////
  return 0;
}
# generic Makefile...
# mingw32-make compatible/tested/designed
CPP = mingw32-g++
CU = nvcc
INCDIR = C:\MinGW\include
CFLAGS = -I$(INCDIR)

# none of these targets create a file with its own name (outputs are *.exe),
# so declare them phony: rebuilds stay correct even if a stray file named
# `duple` / `duple-gpu` ever appears in the directory
.PHONY: all duple duple-gpu clean

all: duple duple-gpu

duple: duple.cpp
	$(CPP) -o $@.exe $^ $(CFLAGS)

duple-gpu: duple-gpu.cu
	$(CU) -o $@.exe $^

# if UNIX system change `del` to `rm`
clean:
	del *.exe
PS C:> # (tested/retrieved via powershell...)
PS C:> # prototype output
PS C:> py .\dupleproto.py
orig = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
nu = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
nu = [0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 8, 9, 8, 9, 8, 9]
PS C:> # sequential output
PS C:> mingw32-make duple; if ( $? ) { .\duple.exe }
mingw32-g++ -o duple.exe duple.cpp -IC:\MinGW\include
duple start
0123401234012340123456789567895678956789
PS C:> # parallel output
PS C:> mingw32-make duple-gpu; if ( $? ) { .\duple-gpu.exe }
nvcc -o duple-gpu.exe duple-gpu.cu
duple-gpu.cu
Creating library duple-gpu.lib and object duple-gpu.exp
duple start
inputted: 0123456789
outputted: 0123401234012340123456789567895678956789