|
// Store tile values (w/ border) in shared memory: |
|
// - first compute and store the "border" values (by a subset of the threads of the block) |
|
// - then compute the "center" values (the one corresponding to each thread that will be used afterwards). |
|
// |
|
// "Center" values: o |
|
// "Border" values: c, h, v |
|
// |
|
// The "borders" values are classified in 3 groups: |
|
// - The first tile row + the left border of the 2nd row (range end: r0) |
|
// - The left and right borders values + the border and the last row (range end: r1) |
|
// - The last tile row minus the left border (range end: r2) |
|
// |
|
// Ex: 8x8 kernel + 1-pixel border |
|
// |
|
// Visualized in 2D: |
|
// c h h h h h h h h c |
|
// v o o o o o o o o v |
|
// v o o o o o o o o v |
|
// v o o o o o o o o v |
|
// v o o o o o o o o v |
|
// v o o o o o o o o v |
|
// v o o o o o o o o v |
|
// v o o o o o o o o v |
|
// v o o o o o o o o v |
|
// c h h h h h h h h c |
|
// |
|
// Visualized in 1D: |
|
// c h h h h h h h h c v o o o o o o o o v v ... v v c h h h h h h h h c |
|
// ^ ^ ^ |
|
// | | | |
|
// r0 r1 r2 |
|
// |
|
// The border values are indexed in the following order: |
|
// |
|
// Visualized in 2D: |
|
// 0 1 2 3 4 5 6 7 8 9 |
|
// 10 o o o o o o o o 11 |
|
// 12 o o o o o o o o 13 |
|
// 14 o o o o o o o o 15 |
|
// 16 o o o o o o o o 17 |
|
// 18 o o o o o o o o 19 |
|
// 20 o o o o o o o o 21 |
|
// 22 o o o o o o o o 23 |
|
// 24 o o o o o o o o 25 |
|
// 26 27 28 29 30 31 32 33 34 35 |
|
// |
|
// Visualized in 1D: |
|
// 0 1 2 3 4 5 6 7 8 9 10 o o o o o o o o 11 12 ... 25 26 27 28 29 30 31 32 33 34 35 |
|
// ^ ^ ^ |
|
// | | | |
|
// r0 r1 r2 |
|
// |
|
// |
|
// In shared memory the values are stored linearly row by row (10 by 10 values in the example). |
|
|
|
const int kBlockSizeX = ...; |
|
const int kBlockSizeY = ...; |
|
const int kBlockSize = BlockSizeX * BlockSizeY; |
|
const int kTileBorder = ...; |
|
const int kTileSizeX = kBlockSizeX + kTileBorder * 2; |
|
const int kTileSizeY = kBlockSizeY + kTileBorder * 2; |
|
|
|
const auto TileLinearIndexToTile2DCoords = [](int inTileLinearIdx, ivec2 tileSize) -> ivec2 |
|
{ |
|
return ivec2(inTileLinearIdx % tileSize.x, inTileLinearIdx / tileSize.x); |
|
}; |
|
|
|
const ivec2 blockMinCoords = ivec2(indexing.blockIdx.x * indexing.blockDim.x, indexing.blockIdx.y * indexing.blockDim.y); |
|
const ivec2 tileUpperLeft = ivec2(blockMinCoords.x - kTileBorder, blockMinCoords.y - kTileBorder); |
|
|
|
// Store tile border values in shared memory |
|
const int r0 = kTileSizeX * kTileBorder + kTileBorder; // Offset after top full border lines + 1st border on non-full border line |
|
const int r1 = kTileSizeX * kTileBorder + kBlockSizeY * kTileBorder * 2 + kTileBorder; |
|
const int r2 = kTileSizeX * kTileBorder + kBlockSizeY * kTileBorder * 2 + kTileSizeX * kTileBorder; |
|
uint numIterSM = (r2 + kBlockSize - 1) / kBlockSize; |
|
for(uint k = 0; k < numIterSM; ++k) |
|
{ |
|
uint m = indexing.localLinearThreadIdx + k * kBlockSize; |
|
|
|
if(m < r2) |
|
{ |
|
int borderCellIdx = -1; |
|
if(m < r0) |
|
{ |
|
borderCellIdx = m; |
|
} |
|
else if(m < r1) |
|
{ |
|
const int baseIdx = m - r0; |
|
const int c = baseIdx / (2 * kTileBorder); |
|
const int d = baseIdx % (2 * kTileBorder); |
|
borderCellIdx = r0 + c * kTileSizeX + kBlockSizeX + d; |
|
} |
|
else if(m < r2) |
|
{ |
|
const int baseIdx = m - r1; |
|
borderCellIdx = baseIdx + kTileSizeX * kTileBorder + kTileSizeX * kBlockSizeY + kTileBorder; |
|
} |
|
|
|
const ivec2 inTile2DCoords = TileLinearIndexToTile2DCoords(borderCellIdx, ivec2(kTileSizeX, kTileSizeY)); |
|
const ivec2 p = tileUpperLeft + inTile2DCoords; |
|
|
|
float value = func(p, ...); |
|
|
|
SMStoreXXX(borderCellIdx, value); |
|
} |
|
|
|
// Store tile center values in shared memory |
|
float value = func(currPixelCoords, ...); |
|
SMStoreXXX(currPixelTileLinearIdx, value); |
|
|
|
__syncthreads(); |