@roastduck
Created August 6, 2017 08:57
Blocked Floyd–Warshall algorithm implemented with CUDA
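/** Blocked (tiled) Floyd–Warshall all-pairs shortest paths on the GPU.
 * The n x n distance matrix is split into blkFactor x blkFactor blocks, and each
 * outer iteration runs three phases: the pivot block (phase 1), the pivot block
 * row and column (phase 2), and all remaining blocks (phase 3).
 * With DUAL_GPU defined, phases 1 and 2 run on both devices and the phase-3 block
 * rows are split between them, exchanging results via cudaMemcpyPeerAsync.
 * Assumed build line (not part of the original gist):
 *   nvcc -O3 -Xcompiler -fopenmp -DDUAL_GPU apsp.cu -o apsp
 */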
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cuda_runtime.h>
#ifdef DUAL_GPU
#include <omp.h>
#endif
#ifndef NDEBUG
#define CHKERR(call) assert((call) == cudaSuccess) // NDEBUG is not defined, so assert() is active and the call is always evaluated
#else
#define CHKERR(call) (call)
#endif
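// With DUAL_GPU, each device gets a compute stream (0/1) and a peer-copy stream (2/3)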
#ifdef DUAL_GPU
#define GPU_NUM 2
#define STREAM_NUM 4
#else
#define GPU_NUM 1
#define STREAM_NUM 1
#endif
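// Read one non-negative decimal integer from f, skipping any leading non-digit characters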
inline void fastRead(FILE *f, int &x)
{
char c;
x = 0;
do c = fgetc(f); while (c < '0' || c > '9');
do x = x * 10 + c - '0', c = fgetc(f); while (c >= '0' && c <= '9');
}
/** Part of the graph
 * This class is only a memory address converter; it does not own the matrix storage
 */
class Block
{
public:
__host__ __device__ Block(int *_addr, int _height, int _width)
: addr(_addr), height(_height), width(_width) {}
__host__ __device__ int &operator()(int i, int j)
{
assert(i >= 0 && i < height);
assert(j >= 0 && j < width);
return *(addr + i * width + j);
}
int count() const
{
return height * width;
}
int *const addr;
const int height, width;
};
/** A full graph
 * This class is only a memory address converter; it does not own the matrix storage
 */
class Graph
{
public:
Graph()
: addr(NULL), n(0), blkFactor(0), blkNum(0) {}
Graph(int *_addr, int _n, int _blkFactor)
: addr(_addr), n(_n), blkFactor(_blkFactor), blkNum(n / blkFactor + (n % blkFactor > 0)) {}
__host__ __device__ Block operator()(int i, int j)
{
assert(i >= 0 && i < blkNum);
assert(j >= 0 && j < blkNum);
// DO NOT delete these two assertions; they enforce the half-open index range [0, blkNum)
// This function doesn't guarantee correctness when the indices are out of range
int height = min(blkFactor, n - i * blkFactor);
int width = min(blkFactor, n - j * blkFactor);
return Block(addr + (i * n + j * height) * blkFactor, height, width);
}
__host__ __device__ int &edge(int i, int j)
{
assert(i >= 0 && i < n);
assert(j >= 0 && j < n);
return (*this)(i / blkFactor, j / blkFactor)(i % blkFactor, j % blkFactor);
}
void copyToDevice(const Graph &device, cudaStream_t stream)
{
CHKERR(cudaMemcpyAsync(device.addr, addr, n * n * sizeof(int), cudaMemcpyHostToDevice, stream));
}
void copyToHost(const Graph &host)
{
CHKERR(cudaMemcpy(host.addr, addr, n * n * sizeof(int), cudaMemcpyDeviceToHost));
}
private:
int *addr; // cannot be const because this object must be copy-assignable
public:
int n, blkFactor, blkNum;
};
extern __shared__ int sharedMem[];
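// quad: let one thread handle up to four elements (begin, begin + span, begin + 2 * span,
// begin + 3 * span), each guarded against bound; used with div4() so a block of
// ceil(count / 4) threads covers a whole matrix block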
#define quad(id, begin, span, bound, expr) \
if ((id = (begin)) < bound) { \
expr; \
if ((id = (begin) + (span)) < bound) { \
expr; \
if ((id = (begin) + 2 * (span)) < bound) { \
expr; \
if ((id = (begin) + 3 * (span)) < bound) { \
expr; \
} \
} \
} \
}
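/** Phase 1: relax the pivot block graph(pivotId, pivotId) against itself.
 * The block is staged in shared memory, updated with Floyd–Warshall over every k
 * inside the block, and written back to global memory.
 */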
__global__ void phase1Kernel(Graph graph, int pivotId)
{
Block globalBlk(graph(pivotId, pivotId));
Block sharedBlk(sharedMem, globalBlk.height, globalBlk.width);
const int tid = threadIdx.x, span = blockDim.x, bound = sharedBlk.height * sharedBlk.width;
int id; // placeholder
quad(id, tid, span, bound, *(sharedBlk.addr + id) = *(globalBlk.addr + id));
__syncthreads();
assert(sharedBlk.width == sharedBlk.height);
#pragma unroll 64
for (int k = 0; k < sharedBlk.width; k++)
{
int i, j;
quad(id, tid, span, bound,
i = id / sharedBlk.width; j = id % sharedBlk.width; // can't use comma here because of the macro
sharedBlk(i, j) = min(sharedBlk(i, j), sharedBlk(i, k) + sharedBlk(k, j))
);
__syncthreads();
}
quad(id, tid, span, bound, *(globalBlk.addr + id) = *(sharedBlk.addr + id));
}
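/** Phase 2: relax the blocks in the pivot block row and pivot block column.
 * blockIdx.x selects row (0) or column (1); blockIdx.y selects the vassal block,
 * skipping the pivot itself. The pivot block and the vassal block are both staged
 * in shared memory, and the vassal block is relaxed against the pivot block.
 */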
__global__ void phase2Kernel(Graph graph, int pivotId, int offset)
{
Block globalPivotBlk(graph(pivotId, pivotId));
int column = blockIdx.x; // 0 = row, 1 = column
assert(column == 0 || column == 1);
int vassalId = blockIdx.y + (blockIdx.y >= pivotId);
Block globalVassalBlk = column ? graph(vassalId, pivotId) : graph(pivotId, vassalId);
Block sharedPivotBlk(sharedMem, globalPivotBlk.height, globalPivotBlk.width);
Block sharedVassalBlk(sharedMem + sharedPivotBlk.width * sharedPivotBlk.height, globalVassalBlk.height, globalVassalBlk.width);
const int tid = threadIdx.x, span = blockDim.x;
int id; // placeholder
quad(id, tid, span, sharedPivotBlk.height * sharedPivotBlk.width, *(sharedPivotBlk.addr + id) = *(globalPivotBlk.addr + id));
quad(id, tid, span, sharedVassalBlk.height * sharedVassalBlk.width, *(sharedVassalBlk.addr + id) = *(globalVassalBlk.addr + id));
__syncthreads();
assert(sharedPivotBlk.width == sharedPivotBlk.height);
#pragma unroll 64
for (int k = 0; k < sharedPivotBlk.width; k++)
{
int i, j;
quad(id, tid, span, sharedVassalBlk.height * sharedVassalBlk.width,
i = id / sharedVassalBlk.width; j = id % sharedVassalBlk.width;
sharedVassalBlk(i, j) = min(
sharedVassalBlk(i, j),
column ? sharedVassalBlk(i, k) + sharedPivotBlk(k, j) : sharedPivotBlk(i, k) + sharedVassalBlk(k, j)
);
);
__syncthreads();
}
quad(id, tid, span, sharedVassalBlk.height * sharedVassalBlk.width, *(globalVassalBlk.addr + id) = *(sharedVassalBlk.addr + id));
}
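/** Phase 3: relax every remaining block graph(rowId, colId).
 * Each thread block handles one block row (rowId skips the pivot row, shifted by
 * offset when tasks are chunked). The row's pivot-column block stays in shared
 * memory while the kernel iterates over the columns, loading the matching
 * pivot-row block and updating the target block directly in global memory.
 */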
__global__ void phase3Kernel(Graph graph, int pivotId, int offset)
{
const int rowId = blockIdx.x + offset + (blockIdx.x + offset >= pivotId);
Block globalLRBlk = graph(rowId, pivotId);
Block sharedLRBlk(sharedMem, globalLRBlk.height, globalLRBlk.width);
const int tid = threadIdx.x, span = blockDim.x;
int id; // placeholder
quad(id, tid, span, sharedLRBlk.height * sharedLRBlk.width, *(sharedLRBlk.addr + id) = *(globalLRBlk.addr + id));
for (int colId = !pivotId; colId < graph.blkNum; colId++, colId += (colId == pivotId))
{
Block globalUDBlk = graph(pivotId, colId);
Block sharedUDBlk(sharedMem + sharedLRBlk.width * sharedLRBlk.height, globalUDBlk.height, globalUDBlk.width);
quad(id, tid, span, sharedUDBlk.height * sharedUDBlk.width, *(sharedUDBlk.addr + id) = *(globalUDBlk.addr + id));
__syncthreads();
Block globalVassalBlk = graph(rowId, colId);
#pragma unroll 4
for (int iter = 0; iter < 4 && (id = tid + iter * span) < globalVassalBlk.height * globalVassalBlk.width; iter++)
{
// can't use the quad macro here because of the #pragma unroll below
const int i = id / globalVassalBlk.width, j = id % globalVassalBlk.width;
int res = *(globalVassalBlk.addr + id);
#pragma unroll 64
for (int k = 0; k < sharedLRBlk.width; k++)
res = min(res, sharedLRBlk(i, k) + sharedUDBlk(k, j));
*(globalVassalBlk.addr + id) = res;
}
__syncthreads(); // Prevent the next iteration from overwriting shared memory too soon
}
}
#ifdef DUAL_GPU
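/** Copy the block rows updated by the phase-3 chunk [beginTask, endTask) from srcDev
 * to dstDev. Task indices skip the pivot block row; if the pivot row falls inside the
 * copied range, the copy is split around it so that row is never overwritten.
 */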
inline void phase3TaskCopy(int srcDev, int dstDev, Graph graphs[], cudaStream_t stream, int blkId, int beginTask, int endTask)
{
int n = graphs[0].n, blkFactor = graphs[0].blkFactor;
int beginId = beginTask + (beginTask >= blkId), endId = endTask + (endTask - 1 >= blkId);
assert(endId > beginId);
if (blkId >= beginId && blkId < endId) // Skip the pivot block row so we don't interfere with the other stream on the same device
{
CHKERR(cudaMemcpyPeerAsync(
graphs[dstDev](beginId, 0).addr,
dstDev,
graphs[srcDev](beginId, 0).addr,
srcDev,
(blkId - beginId) * blkFactor * n * sizeof(int),
stream
));
CHKERR(cudaMemcpyPeerAsync(
graphs[dstDev](blkId + 1, 0).addr,
dstDev,
graphs[srcDev](blkId + 1, 0).addr,
srcDev,
(min(n, endId * blkFactor) - (blkId + 1) * blkFactor) * n * sizeof(int),
stream
));
} else
CHKERR(cudaMemcpyPeerAsync(
graphs[dstDev](beginId, 0).addr,
dstDev,
graphs[srcDev](beginId, 0).addr,
srcDev,
(min(n, endId * blkFactor) - beginId * blkFactor) * n * sizeof(int),
stream
));
}
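/** Split the phase-3 block-row tasks between the two GPUs.
 * Two OpenMP threads pull chunks of DYNAMIC_CHUNK_SIZE tasks, launch the kernel on
 * their own device's compute stream, wait for it, and push the finished block rows
 * to the other device on the peer-copy stream. Both devices are synchronized before
 * returning.
 */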
template <void(*kernel)(Graph,int,int), void(*taskCopy)(int,int,Graph*,cudaStream_t,int,int,int)>
void dispatch(int taskNum, int threadsNum, int sharedSize, Graph graphs[], cudaStream_t streams[], int blkId)
{
const int DYNAMIC_CHUNK_SIZE = 14 * 1; // 14 = # of SM
#pragma omp parallel for num_threads(2) schedule(dynamic, 1)
for (int st = 0; st < taskNum; st += DYNAMIC_CHUNK_SIZE)
{
int tid = omp_get_thread_num();
assert(tid == 0 || tid == 1);
CHKERR(cudaSetDevice(tid));
int en = min(taskNum, st + DYNAMIC_CHUNK_SIZE);
kernel<<<en - st, threadsNum, sharedSize, streams[tid]>>>(graphs[tid], blkId, st);
CHKERR(cudaStreamSynchronize(streams[tid]));
taskCopy(tid, !tid, graphs, streams[tid + 2], blkId, st, en);
}
for (int i = 0; i < GPU_NUM; i++)
{
CHKERR(cudaSetDevice(i));
CHKERR(cudaDeviceSynchronize());
}
}
#endif // DUAL_GPU
#define div4(x) (((x) >> 2) + bool((x) & 3)) // ceil(x / 4): each thread handles up to 4 elements (see the quad macro)
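/** One outer iteration of blocked Floyd–Warshall for pivot block blkId:
 * phase 1 on the pivot block, phase 2 on the pivot block row and column, and
 * phase 3 on all remaining blocks. With DUAL_GPU, phases 1 and 2 run on every
 * device and phase 3 is split between the devices via dispatch().
 */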
void stage(Graph graphs[], cudaStream_t streams[], int blkId)
{
int pivotCnt = graphs[0](blkId, blkId).count(), maxCnt = graphs[0].blkFactor * graphs[0].blkFactor;
int pivotSize = pivotCnt * sizeof(int), maxSize = maxCnt * sizeof(int);
for (int i = 0; i < GPU_NUM; i++)
{
CHKERR(cudaSetDevice(i));
phase1Kernel<<<1, div4(pivotCnt), pivotSize, streams[i]>>>(graphs[i], blkId);
if (graphs[0].blkNum > 1)
{
phase2Kernel<<<dim3(2, graphs[0].blkNum - 1), div4(maxCnt), pivotSize + maxSize, streams[i]>>>(graphs[i], blkId, 0);
#ifndef DUAL_GPU
phase3Kernel<<<graphs[0].blkNum - 1, div4(maxCnt), 2 * maxSize, streams[i]>>>(graphs[i], blkId, 0);
#endif
}
}
#ifdef DUAL_GPU
if (graphs[0].blkNum > 1)
dispatch<phase3Kernel, phase3TaskCopy>(graphs[0].blkNum - 1, div4(maxCnt), 2 * maxSize, graphs, streams, blkId);
#endif
}
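/** Read "n m" followed by m edges "x y w" (vertices are 1-indexed), run blkNum
 * stages of blocked Floyd–Warshall, and print the n x n distance matrix with 0 on
 * the diagonal and "INF" for unreachable pairs.
 */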
int main(const int argc, const char *const *argv)
{
if (argc != 4)
{
puts("Usage: ./<executable name> <input file> <output file> <blocking factor>");
return 1;
}
const char *inName = argv[1], *outName = argv[2];
const int blkFactor = atoi(argv[3]);
static char inBuf[1 << 20];
FILE *inFile = fopen(inName, "r");
setvbuf(inFile, inBuf, _IOFBF, 1 << 20);
int n, m;
fastRead(inFile, n), fastRead(inFile, m);
#ifdef DUAL_GPU
{
cudaError_t err;
CHKERR(cudaSetDevice(0));
err = cudaDeviceEnablePeerAccess(1, 0), assert(err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled);
CHKERR(cudaSetDevice(1));
err = cudaDeviceEnablePeerAccess(0, 0), assert(err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled);
CHKERR(cudaSetDevice(0));
}
#endif
int *hostAddr;
CHKERR(cudaMallocHost((void**)&hostAddr, n * n * sizeof(int)));
Graph hostGraph(hostAddr, n, blkFactor);
int *deviceAddrs[GPU_NUM];
Graph deviceGraphs[GPU_NUM];
for (int i = 0; i < GPU_NUM; i++)
{
CHKERR(cudaSetDevice(i));
CHKERR(cudaMalloc((void**)(deviceAddrs + i), n * n * sizeof(int)));
deviceGraphs[i] = Graph(deviceAddrs[i], n, blkFactor);
}
cudaStream_t streams[STREAM_NUM]; // Default stream can't work correctly with cudaMemcpyPeerAsync
for (int i = 0; i < STREAM_NUM; i++)
{
CHKERR(cudaSetDevice(i & 1));
CHKERR(cudaStreamCreate(streams + i));
}
CHKERR(cudaSetDevice(0));
memset(hostAddr, 0x01, n * n * sizeof(int)); // fill every int with 0x01010101 as the "infinity" sentinel
for (int i = 0; i < m; i++)
{
int x, y, w;
fastRead(inFile, x), fastRead(inFile, y), fastRead(inFile, w);
hostGraph.edge(x - 1, y - 1) = w;
}
fclose(inFile);
for (int i = 0; i < GPU_NUM; i++)
{
CHKERR(cudaSetDevice(i));
hostGraph.copyToDevice(deviceGraphs[i], streams[i]);
}
for (int i = 0; i < deviceGraphs[0].blkNum; i++)
stage(deviceGraphs, streams, i);
CHKERR(cudaSetDevice(0));
deviceGraphs[0].copyToHost(hostGraph);
FILE *outFile = fopen(outName, "w");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
int res = i == j ? 0 : hostGraph.edge(i, j);
if (res == 0x01010101)
fprintf(outFile, "%s ", "INF");
else
fprintf(outFile, "%d ", res);
}
fputc('\n', outFile);
}
fclose(outFile);
CHKERR(cudaFreeHost(hostAddr));
for (int i = 0; i < GPU_NUM; i++)
{
CHKERR(cudaSetDevice(i));
CHKERR(cudaFree(deviceAddrs[i]));
}
for (int i = 0; i < STREAM_NUM; i++)
{
CHKERR(cudaSetDevice(i & 1));
CHKERR(cudaStreamDestroy(streams[i]));
}
CHKERR(cudaSetDevice(0));
return 0;
}