Skip to content

Instantly share code, notes, and snippets.

@kmfarley11
Last active January 28, 2020 02:52
Show Gist options
  • Save kmfarley11/54b4451902a93304a030617a51e1ddad to your computer and use it in GitHub Desktop.
Save kmfarley11/54b4451902a93304a030617a51e1ddad to your computer and use it in GitHub Desktop.
Byte Repeater: duplicate some number of bytes in an input buffer some number of times and store in an output buffer

Byte Repeater


duplicate some number of bytes in chunks from an input buffer to an output buffer

Contents:

  • python prototype (python3)
  • sequential implementation (cpp)
  • parallel implementation (cuda)
  • makefile example (mingw32-make)
  • example output (powershell)

Why? sometimes you just need to repeat some bytes! (up sampling, data redundancy, data padding)


Prototype (Python3) Implementation

# dupleproto.py
#
# PROTOTYPE (python3) VERSION of Byte Repeater
#   mostly just designed to make sense of how to do the indexing...
#
# Usage: `python3 dupleproto.py`

memsize = 10
num_rpt = 3
chunk_size = 2

orig = list(range(memsize))
print('orig =', orig)

# chunks must tile the input exactly, otherwise bail out
if memsize % chunk_size:
    print('error, chunk must be divisible...')
    exit(1)

# output holds every chunk repeated num_rpt times
nu = [0] * (memsize * num_rpt)

for base in range(0, memsize, chunk_size):
    # each input chunk of chunk_size bytes lands at base*num_rpt in the output
    dst_base = base * num_rpt
    for rep in range(num_rpt):
        for off in range(chunk_size):
            # repetition rep of this chunk starts rep*chunk_size past dst_base
            nu[dst_base + rep * chunk_size + off] = orig[base + off]

print('nu =', nu)

Sequential (CPP) Implementation

// duple.cpp
// SERIAL (cpu) VERSION of Byte Repeater
//
// Usage: `g++ -o duple.exe duple.cpp && ./duple.exe`

#include <iostream>
#include <string.h>
#include <string>

using namespace std;


int main() {
    cout << "duple start" << endl;
    string indat = "0123456789";
    int memsize = indat.length();
    int rpt_chunk_size = 5;         // num bytes in repeated chunk
    int num_rpt = 4;                // num repetitions of each chunk_size

    ///////////////////////////////////////////////////////////////////////////
    if (memsize % rpt_chunk_size != 0) {
        cout << "PROBLEMATIC! (chunks must be divisible)" << endl;
        return 1;
    }
    int upsamp_size = memsize*num_rpt;
    char *upsampled_mem = (char*)malloc(upsamp_size);
    memset(upsampled_mem, 0, upsamp_size);
    ///////////////////////////////////////////////////////////////////////////
    // do duple (sequential)
    for (int i = 0; i < memsize; i += rpt_chunk_size) {
        for (int j = 0; j < rpt_chunk_size; j++) {
            for (int k = 0; k < num_rpt; k++) {
                // IDX FROM BASE FOR NEW CHUNK: j + k * chunk_size
                // BASE IDX FOR NEW CHUNK:      i * num_rpt
                int new_idx = i*num_rpt + j + k*rpt_chunk_size;
                upsampled_mem[new_idx] = indat[i+j];
            }
        }
    }
    // print results
    cout << endl;
    for (int i = 0; i < upsamp_size; i++) {
        cout << upsampled_mem[i];
    }
    cout << endl << endl;
    // cleanup
    free(upsampled_mem);
    ///////////////////////////////////////////////////////////////////////////

    return 0;
}

Parallel (CUDA) Implementation

// duple-gpu.cu
//
// PARALLEL (cuda) VERSION of Byte Repeater
//
// note: useful to figure out indexing: 
// https://cs.calvin.edu/courses/cs/374/CUDA/CUDA-Thread-Indexing-Cheatsheet.pdf
//  (using 1dgrid of 3dblocks here)
//
// TODO: consider how this scales and how adding blocks might work/change it
// TODO: gate the memsize, chunk size, and n_rpt by num thds available?
//
// Usage: `nvcc -o duple-gpu.exe duple-gpu.cu && ./duple-gpu.exe`

#include <iostream>
#include <string.h>

using namespace std;


// Byte-repeater kernel: one thread per (chunk, byte-offset, repetition).
//   x axis -> which chnk_rpt-byte chunk of dev_in
//   y axis -> byte offset within that chunk (< chnk_rpt)
//   z axis -> which repetition of the chunk (< n_rpt)
// dev_out must hold up_size = in_size * n_rpt bytes.
// blockIdx is folded into every index, so the kernel now also works with
// multi-block grids (the original read only threadIdx, limiting it to a
// single block, and gated on redundant conditions like threadIdx.y >= in_size).
__global__ void duple_kernel(char* dev_out, char* dev_in, int n_rpt, 
                             int chnk_rpt, int up_size, int in_size) {
    int chunk = blockIdx.x * blockDim.x + threadIdx.x;
    int off   = blockIdx.y * blockDim.y + threadIdx.y;
    int rep   = blockIdx.z * blockDim.z + threadIdx.z;
    int src_idx = chunk * chnk_rpt + off;
    // gate out-of-range threads (grid rarely matches the data exactly)
    if (off >= chnk_rpt || rep >= n_rpt || src_idx >= in_size)
        return;
    // BASE IDX FOR NEW CHUNK: chunk * chnk_rpt * n_rpt
    // IDX FROM BASE:          rep * chnk_rpt + off
    int new_idx = chunk * chnk_rpt * n_rpt + rep * chnk_rpt + off;
    if (new_idx < up_size) {
        dev_out[new_idx] = dev_in[src_idx];
    }
}


int main() {
    ///////////////////////////////////////////////////////////////////////////
    cout << "duple start" << endl;
    const char *indat = "0123456789";
    int memsize = (int)strlen(indat);   // derive from input (was hard-coded 10)
    int rpt_chunk_size = 5;         // num bytes in repeated chunk
    int num_rpt = 4;                // num repetitions of each chunk_size

    // Host setup: chunks must tile the input exactly
    if (memsize % rpt_chunk_size != 0) {
        cout << "PROBLEMATIC! (chunks must be divisible)" << endl;
        return 1;
    }
    int upsamp_size = memsize*num_rpt;
    char *upsampled_mem = (char*)malloc(upsamp_size);
    if (upsampled_mem == NULL) {
        cout << "PROBLEMATIC! (host malloc failed)" << endl;
        return 1;
    }
    memset(upsampled_mem, 0, upsamp_size);
    
    ///////////////////////////////////////////////////////////////////////////
    // GPU setup: a single block of (num_chunks, rpt_chunk_size, num_rpt)
    // threads. The original named these dim3s backwards ("block" was the grid
    // and "grid" was the block) and used ceil() on an already-integer
    // division, which was a no-op; use integer ceil-div instead.
    int num_chunks = (memsize + rpt_chunk_size - 1) / rpt_chunk_size;
    dim3 blocks_per_grid(1, 1, 1);
    dim3 threads_per_block(num_chunks, rpt_chunk_size, num_rpt);
    char *dev_usampled_mem, *dev_indat;
    cudaMalloc((void**)&dev_usampled_mem, upsamp_size);
    cudaMalloc((void**)&dev_indat, memsize);
    cudaMemcpy(dev_indat, indat, memsize, cudaMemcpyHostToDevice);
    // GPU exec
    duple_kernel<<<blocks_per_grid,threads_per_block>>>(dev_usampled_mem,
                                 dev_indat, num_rpt,
                                 rpt_chunk_size,
                                 upsamp_size,
                                 memsize);
    // launch-config errors only surface via cudaGetLastError()
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        cout << "PROBLEMATIC! (kernel launch: " << cudaGetErrorString(err)
             << ")" << endl;
    }
    // GPU get results and cleanup
    cudaDeviceSynchronize();
    cudaMemcpy(upsampled_mem, dev_usampled_mem, upsamp_size, 
               cudaMemcpyDeviceToHost);
    cudaFree(dev_indat);
    cudaFree(dev_usampled_mem);
    ///////////////////////////////////////////////////////////////////////////

    // print results
    cout << endl;
    cout << "inputted: ";
    for (int i = 0; i < memsize; i++) {
        cout << indat[i];
    }
    cout << endl << "outputted: ";
    for (int i = 0; i < upsamp_size; i++) {
        cout << upsampled_mem[i];
    }
    cout << endl << endl;

    // Host cleanup
    free(upsampled_mem);
    ///////////////////////////////////////////////////////////////////////////

    return 0;
}

Makefile example

# generic Makefile...
# mingw32-make compatible/tested/designed
# Builds both the sequential (g++) and CUDA (nvcc) byte-repeater binaries.
CPP = mingw32-g++
CU = nvcc
# MinGW header location; adjust if MinGW is installed elsewhere
INCDIR = C:\MinGW\include
CFLAGS = -I$(INCDIR)

# default target: build both executables
all: duple duple-gpu

# sequential CPU version
duple: duple.cpp
	$(CPP) -o $@.exe $^ $(CFLAGS)

# parallel GPU version (nvcc supplies CUDA headers/libs itself)
duple-gpu: duple-gpu.cu
	$(CU) -o $@.exe $^

# if UNIX system change `del` to `rm`
clean:
	del *.exe

Example output

PS C:> # (tested/retrieved via powershell...)
PS C:> # prototype output
PS C:> py .\dupleproto.py
orig = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
nu = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
nu = [0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 8, 9, 8, 9, 8, 9]

PS C:> # sequential output
PS C:> mingw32-make duple; if ( $? ) { .\duple.exe }
mingw32-g++ -o duple.exe duple.cpp -IC:\MinGW\include
duple start

0123401234012340123456789567895678956789

PS C:> # parallel output
PS C:> mingw32-make duple-gpu; if ( $? ) { .\duple-gpu.exe } 
nvcc -o duple-gpu.exe duple-gpu.cu
duple-gpu.cu
   Creating library duple-gpu.lib and object duple-gpu.exp
duple start

inputted: 0123456789
outputted: 0123401234012340123456789567895678956789
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment