Skip to content

Instantly share code, notes, and snippets.

@kmfarley11
Last active January 28, 2020 02:52
Show Gist options
  • Save kmfarley11/54b4451902a93304a030617a51e1ddad to your computer and use it in GitHub Desktop.
Save kmfarley11/54b4451902a93304a030617a51e1ddad to your computer and use it in GitHub Desktop.
Byte Repeater: duplicate some number of bytes in an input buffer some number of times and store in an output buffer

Byte Repeater


duplicate some number of bytes in chunks from an input buffer to an output buffer

Contents:

  • python prototype (python3)
  • sequential implementation (cpp)
  • parallel implementation (cuda)
  • makefile example (mingw32-make)
  • example output (powershell)

Why? sometimes you just need to repeat some bytes! (up sampling, data redundancy, data padding)


Prototype (Python3) Implementation

# dupleproto.py
#
# PROTOTYPE (python3) VERSION of Byte Repeater
#   mostly just designed to make sense of how to do the indexing...
#
# Usage: `python3 dupleproto.py`

memsize = 10
num_rpt = 3
chunk_size = 2

orig = list(range(memsize))
print('orig =', orig)

# chunks must tile the input exactly, otherwise bail out
if memsize % chunk_size:
    print('error, chunk must be divisible...')
    exit(1)

# output holds every chunk repeated num_rpt times
nu = [0] * (memsize * num_rpt)

for base in range(0, memsize, chunk_size):
    # each input chunk of chunk_size bytes lands at base*num_rpt in the output
    dst_base = base * num_rpt
    for rep in range(num_rpt):
        for off in range(chunk_size):
            # repetition rep of this chunk starts rep*chunk_size past dst_base
            nu[dst_base + rep * chunk_size + off] = orig[base + off]

print('nu =', nu)

Sequential (CPP) Implementation

// duple.cpp
// SERIAL (cpu) VERSION of Byte Repeater
//
// Usage: `g++ -o duple.exe duple.cpp && ./duple.exe`

#include <iostream>
#include <string.h>
#include <string>

using namespace std;


int main() {
    cout << "duple start" << endl;
    string indat = "0123456789";
    int memsize = indat.length();
    int rpt_chunk_size = 5;         // num bytes in repeated chunk
    int num_rpt = 4;                // num repetitions of each chunk_size

    ///////////////////////////////////////////////////////////////////////////
    if (memsize % rpt_chunk_size != 0) {
        cout << "PROBLEMATIC! (chunks must be divisible)" << endl;
        return 1;
    }
    int upsamp_size = memsize*num_rpt;
    char *upsampled_mem = (char*)malloc(upsamp_size);
    memset(upsampled_mem, 0, upsamp_size);
    ///////////////////////////////////////////////////////////////////////////
    // do duple (sequential)
    for (int i = 0; i < memsize; i += rpt_chunk_size) {
        for (int j = 0; j < rpt_chunk_size; j++) {
            for (int k = 0; k < num_rpt; k++) {
                // IDX FROM BASE FOR NEW CHUNK: j + k * chunk_size
                // BASE IDX FOR NEW CHUNK:      i * num_rpt
                int new_idx = i*num_rpt + j + k*rpt_chunk_size;
                upsampled_mem[new_idx] = indat[i+j];
            }
        }
    }
    // print results
    cout << endl;
    for (int i = 0; i < upsamp_size; i++) {
        cout << upsampled_mem[i];
    }
    cout << endl << endl;
    // cleanup
    free(upsampled_mem);
    ///////////////////////////////////////////////////////////////////////////

    return 0;
}

Parallel (CUDA) Implementation

// duple-gpu.cu
//
// PARALLEL (cuda) VERSION of Byte Repeater
//
// note: useful to figure out indexing: 
// https://cs.calvin.edu/courses/cs/374/CUDA/CUDA-Thread-Indexing-Cheatsheet.pdf
//  (using 1dgrid of 3dblocks here)
//
// TODO: consider how this scales and how adding blocks might work/change it
// TODO: gate the memsize, chunk size, and n_rpt by num thds available?
//
// Usage: `nvcc -o duple-gpu.exe duple-gpu.cu && ./duple-gpu.exe`

#include <iostream>
#include <string.h>

using namespace std;


// Byte-repeater kernel: one thread per (chunk, byte-offset, repetition).
//   x axis -> which chnk_rpt-byte chunk of dev_in
//   y axis -> byte offset within that chunk (< chnk_rpt)
//   z axis -> which repetition of the chunk (< n_rpt)
// dev_out must hold up_size = in_size * n_rpt bytes.
// blockIdx is folded into every index, so the kernel now also works with
// multi-block grids (the original read only threadIdx, limiting it to a
// single block, and gated on redundant conditions like threadIdx.y >= in_size).
__global__ void duple_kernel(char* dev_out, char* dev_in, int n_rpt, 
                             int chnk_rpt, int up_size, int in_size) {
    int chunk = blockIdx.x * blockDim.x + threadIdx.x;
    int off   = blockIdx.y * blockDim.y + threadIdx.y;
    int rep   = blockIdx.z * blockDim.z + threadIdx.z;
    int src_idx = chunk * chnk_rpt + off;
    // gate out-of-range threads (grid rarely matches the data exactly)
    if (off >= chnk_rpt || rep >= n_rpt || src_idx >= in_size)
        return;
    // BASE IDX FOR NEW CHUNK: chunk * chnk_rpt * n_rpt
    // IDX FROM BASE:          rep * chnk_rpt + off
    int new_idx = chunk * chnk_rpt * n_rpt + rep * chnk_rpt + off;
    if (new_idx < up_size) {
        dev_out[new_idx] = dev_in[src_idx];
    }
}


int main() {
    ///////////////////////////////////////////////////////////////////////////
    cout << "duple start" << endl;
    const char *indat = "0123456789";
    int memsize = (int)strlen(indat);   // derive from input (was hard-coded 10)
    int rpt_chunk_size = 5;         // num bytes in repeated chunk
    int num_rpt = 4;                // num repetitions of each chunk_size

    // Host setup: chunks must tile the input exactly
    if (memsize % rpt_chunk_size != 0) {
        cout << "PROBLEMATIC! (chunks must be divisible)" << endl;
        return 1;
    }
    int upsamp_size = memsize*num_rpt;
    char *upsampled_mem = (char*)malloc(upsamp_size);
    if (upsampled_mem == NULL) {
        cout << "PROBLEMATIC! (host malloc failed)" << endl;
        return 1;
    }
    memset(upsampled_mem, 0, upsamp_size);
    
    ///////////////////////////////////////////////////////////////////////////
    // GPU setup: a single block of (num_chunks, rpt_chunk_size, num_rpt)
    // threads. The original named these dim3s backwards ("block" was the grid
    // and "grid" was the block) and used ceil() on an already-integer
    // division, which was a no-op; use integer ceil-div instead.
    int num_chunks = (memsize + rpt_chunk_size - 1) / rpt_chunk_size;
    dim3 blocks_per_grid(1, 1, 1);
    dim3 threads_per_block(num_chunks, rpt_chunk_size, num_rpt);
    char *dev_usampled_mem, *dev_indat;
    cudaMalloc((void**)&dev_usampled_mem, upsamp_size);
    cudaMalloc((void**)&dev_indat, memsize);
    cudaMemcpy(dev_indat, indat, memsize, cudaMemcpyHostToDevice);
    // GPU exec
    duple_kernel<<<blocks_per_grid,threads_per_block>>>(dev_usampled_mem,
                                 dev_indat, num_rpt,
                                 rpt_chunk_size,
                                 upsamp_size,
                                 memsize);
    // launch-config errors only surface via cudaGetLastError()
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        cout << "PROBLEMATIC! (kernel launch: " << cudaGetErrorString(err)
             << ")" << endl;
    }
    // GPU get results and cleanup
    cudaDeviceSynchronize();
    cudaMemcpy(upsampled_mem, dev_usampled_mem, upsamp_size, 
               cudaMemcpyDeviceToHost);
    cudaFree(dev_indat);
    cudaFree(dev_usampled_mem);
    ///////////////////////////////////////////////////////////////////////////

    // print results
    cout << endl;
    cout << "inputted: ";
    for (int i = 0; i < memsize; i++) {
        cout << indat[i];
    }
    cout << endl << "outputted: ";
    for (int i = 0; i < upsamp_size; i++) {
        cout << upsampled_mem[i];
    }
    cout << endl << endl;

    // Host cleanup
    free(upsampled_mem);
    ///////////////////////////////////////////////////////////////////////////

    return 0;
}

Makefile example

# generic Makefile...
# mingw32-make compatible/tested/designed
# Builds both the sequential (g++) and CUDA (nvcc) byte-repeater binaries.
CPP = mingw32-g++
CU = nvcc
# MinGW header location; adjust if MinGW is installed elsewhere
INCDIR = C:\MinGW\include
CFLAGS = -I$(INCDIR)

# default target: build both executables
all: duple duple-gpu

# sequential CPU version
duple: duple.cpp
	$(CPP) -o $@.exe $^ $(CFLAGS)

# parallel GPU version (nvcc supplies CUDA headers/libs itself)
duple-gpu: duple-gpu.cu
	$(CU) -o $@.exe $^

# if UNIX system change `del` to `rm`
clean:
	del *.exe

Example output

PS C:> # (tested/retrieved via powershell...)
PS C:> # prototype output
PS C:> py .\dupleproto.py
orig = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
nu = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
nu = [0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 8, 9, 8, 9, 8, 9]

PS C:> # sequential output
PS C:> mingw32-make duple; if ( $? ) { .\duple.exe }
mingw32-g++ -o duple.exe duple.cpp -IC:\MinGW\include
duple start

0123401234012340123456789567895678956789

PS C:> # parallel output
PS C:> mingw32-make duple-gpu; if ( $? ) { .\duple-gpu.exe } 
nvcc -o duple-gpu.exe duple-gpu.cu
duple-gpu.cu
   Creating library duple-gpu.lib and object duple-gpu.exp
duple start

inputted: 0123456789
outputted: 0123401234012340123456789567895678956789
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment