@roastduck
Created August 6, 2017 08:57
Blocked Floyd–Warshall algorithm implemented with CUDA
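/** Blocked (tiled) Floyd–Warshall all-pairs shortest paths on the GPU.
 * The n x n distance matrix is split into blkFactor x blkFactor blocks, and each
 * outer iteration runs three phases: the pivot block (phase 1), the pivot block
 * row and column (phase 2), and all remaining blocks (phase 3).
 * With DUAL_GPU defined, phases 1 and 2 run on both devices and the phase-3 block
 * rows are split between them, exchanging results via cudaMemcpyPeerAsync.
 * Assumed build line (not part of the original gist):
 *   nvcc -O3 -Xcompiler -fopenmp -DDUAL_GPU apsp.cu -o apsp
 */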
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cuda_runtime.h>
#ifdef DUAL_GPU
#include <omp.h>
#endif
#ifndef NDEBUG
#define CHKERR(call) assert((call) == cudaSuccess) // NDEBUG is not defined, so assert() is active and the call is always evaluated
#else
#define CHKERR(call) (call)
#endif
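// With DUAL_GPU, each device gets a compute stream (0/1) and a peer-copy stream (2/3)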
#ifdef DUAL_GPU
#define GPU_NUM 2
#define STREAM_NUM 4
#else
#define GPU_NUM 1
#define STREAM_NUM 1
#endif
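// Read one non-negative decimal integer from f, skipping any leading non-digit characters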
inline void fastRead(FILE *f, int &x)
{
char c;
x = 0;
do c = fgetc(f); while (c < '0' || c > '9');
do x = x * 10 + c - '0', c = fgetc(f); while (c >= '0' && c <= '9');
}
/** Part of the graph
 * This class is only a memory address converter; it does not own the matrix storage
 */
class Block
{
public:
__host__ __device__ Block(int *_addr, int _height, int _width)
: addr(_addr), height(_height), width(_width) {}
__host__ __device__ int &operator()(int i, int j)
{
assert(i >= 0 && i < height);
assert(j >= 0 && j < width);
return *(addr + i * width + j);
}
int count() const
{
return height * width;
}
int *const addr;
const int height, width;
};
/** A full graph
 * This class is only a memory address converter; it does not own the matrix storage
 */
class Graph
{
public:
Graph()
: addr(NULL), n(0), blkFactor(0), blkNum(0) {}
Graph(int *_addr, int _n, int _blkFactor)
: addr(_addr), n(_n), blkFactor(_blkFactor), blkNum(n / blkFactor + (n % blkFactor > 0)) {}
__host__ __device__ Block operator()(int i, int j)
{
assert(i >= 0 && i < blkNum);
assert(j >= 0 && j < blkNum);
// DO NOT delete these two assertions; they enforce the half-open index range [0, blkNum)
// This function doesn't guarantee correctness when the indices are out of range
int height = min(blkFactor, n - i * blkFactor);
int width = min(blkFactor, n - j * blkFactor);
return Block(addr + (i * n + j * height) * blkFactor, height, width);
}
__host__ __device__ int &edge(int i, int j)
{
assert(i >= 0 && i < n);
assert(j >= 0 && j < n);
return (*this)(i / blkFactor, j / blkFactor)(i % blkFactor, j % blkFactor);
}
void copyToDevice(const Graph &device, cudaStream_t stream)
{
CHKERR(cudaMemcpyAsync(device.addr, addr, n * n * sizeof(int), cudaMemcpyHostToDevice, stream));
}
void copyToHost(const Graph &host)
{
CHKERR(cudaMemcpy(host.addr, addr, n * n * sizeof(int), cudaMemcpyDeviceToHost));
}
private:
int *addr; // cannot be const because this object must be copy-assignable
public:
int n, blkFactor, blkNum;
};
extern __shared__ int sharedMem[];
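// quad: let one thread handle up to four elements (begin, begin + span, begin + 2 * span,
// begin + 3 * span), each guarded against bound; used with div4() so a block of
// ceil(count / 4) threads covers a whole matrix block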
#define quad(id, begin, span, bound, expr) \
if ((id = (begin)) < bound) { \
expr; \
if ((id = (begin) + (span)) < bound) { \
expr; \
if ((id = (begin) + 2 * (span)) < bound) { \
expr; \
if ((id = (begin) + 3 * (span)) < bound) { \
expr; \
} \
} \
} \
}
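/** Phase 1: relax the pivot block graph(pivotId, pivotId) against itself.
 * The block is staged in shared memory, updated with Floyd–Warshall over every k
 * inside the block, and written back to global memory.
 */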
__global__ void phase1Kernel(Graph graph, int pivotId)
{
Block globalBlk(graph(pivotId, pivotId));
Block sharedBlk(sharedMem, globalBlk.height, globalBlk.width);
const int tid = threadIdx.x, span = blockDim.x, bound = sharedBlk.height * sharedBlk.width;
int id; // placeholder
quad(id, tid, span, bound, *(sharedBlk.addr + id) = *(globalBlk.addr + id));
__syncthreads();
assert(sharedBlk.width == sharedBlk.height);
#pragma unroll 64
for (int k = 0; k < sharedBlk.width; k++)
{
int i, j;
quad(id, tid, span, bound,
i = id / sharedBlk.width; j = id % sharedBlk.width; // can't use comma here because of the macro
sharedBlk(i, j) = min(sharedBlk(i, j), sharedBlk(i, k) + sharedBlk(k, j))
);
__syncthreads();
}
quad(id, tid, span, bound, *(globalBlk.addr + id) = *(sharedBlk.addr + id));
}
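/** Phase 2: relax the blocks in the pivot block row and pivot block column.
 * blockIdx.x selects row (0) or column (1); blockIdx.y selects the vassal block,
 * skipping the pivot itself. The pivot block and the vassal block are both staged
 * in shared memory, and the vassal block is relaxed against the pivot block.
 */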
__global__ void phase2Kernel(Graph graph, int pivotId, int offset)
{
Block globalPivotBlk(graph(pivotId, pivotId));
int column = blockIdx.x; // 0 = row, 1 = column
assert(column == 0 || column == 1);
int vassalId = blockIdx.y + (blockIdx.y >= pivotId);
Block globalVassalBlk = column ? graph(vassalId, pivotId) : graph(pivotId, vassalId);
Block sharedPivotBlk(sharedMem, globalPivotBlk.height, globalPivotBlk.width);
Block sharedVassalBlk(sharedMem + sharedPivotBlk.width * sharedPivotBlk.height, globalVassalBlk.height, globalVassalBlk.width);
const int tid = threadIdx.x, span = blockDim.x;
int id; // placeholder
quad(id, tid, span, sharedPivotBlk.height * sharedPivotBlk.width, *(sharedPivotBlk.addr + id) = *(globalPivotBlk.addr + id));
quad(id, tid, span, sharedVassalBlk.height * sharedVassalBlk.width, *(sharedVassalBlk.addr + id) = *(globalVassalBlk.addr + id));
__syncthreads();
assert(sharedPivotBlk.width == sharedPivotBlk.height);
#pragma unroll 64
for (int k = 0; k < sharedPivotBlk.width; k++)
{
int i, j;
quad(id, tid, span, sharedVassalBlk.height * sharedVassalBlk.width,
i = id / sharedVassalBlk.width; j = id % sharedVassalBlk.width;
sharedVassalBlk(i, j) = min(
sharedVassalBlk(i, j),
column ? sharedVassalBlk(i, k) + sharedPivotBlk(k, j) : sharedPivotBlk(i, k) + sharedVassalBlk(k, j)
);
);
__syncthreads();
}
quad(id, tid, span, sharedVassalBlk.height * sharedVassalBlk.width, *(globalVassalBlk.addr + id) = *(sharedVassalBlk.addr + id));
}
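/** Phase 3: relax every remaining block graph(rowId, colId).
 * Each thread block handles one block row (rowId skips the pivot row, shifted by
 * offset when tasks are chunked). The row's pivot-column block stays in shared
 * memory while the kernel iterates over the columns, loading the matching
 * pivot-row block and updating the target block directly in global memory.
 */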
__global__ void phase3Kernel(Graph graph, int pivotId, int offset)
{
const int rowId = blockIdx.x + offset + (blockIdx.x + offset >= pivotId);
Block globalLRBlk = graph(rowId, pivotId);
Block sharedLRBlk(sharedMem, globalLRBlk.height, globalLRBlk.width);
const int tid = threadIdx.x, span = blockDim.x;
int id; // placeholder
quad(id, tid, span, sharedLRBlk.height * sharedLRBlk.width, *(sharedLRBlk.addr + id) = *(globalLRBlk.addr + id));
for (int colId = !pivotId; colId < graph.blkNum; colId++, colId += (colId == pivotId))
{
Block globalUDBlk = graph(pivotId, colId);
Block sharedUDBlk(sharedMem + sharedLRBlk.width * sharedLRBlk.height, globalUDBlk.height, globalUDBlk.width);
quad(id, tid, span, sharedUDBlk.height * sharedUDBlk.width, *(sharedUDBlk.addr + id) = *(globalUDBlk.addr + id));
__syncthreads();
Block globalVassalBlk = graph(rowId, colId);
#pragma unroll 4
for (int iter = 0; iter < 4 && (id = tid + iter * span) < globalVassalBlk.height * globalVassalBlk.width; iter++)
{
// can't use the quad macro here because of the #pragma unroll below
const int i = id / globalVassalBlk.width, j = id % globalVassalBlk.width;
int res = *(globalVassalBlk.addr + id);
#pragma unroll 64
for (int k = 0; k < sharedLRBlk.width; k++)
res = min(res, sharedLRBlk(i, k) + sharedUDBlk(k, j));
*(globalVassalBlk.addr + id) = res;
}
__syncthreads(); // Prevent the next iteration from overwriting shared memory too soon
}
}
#ifdef DUAL_GPU
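/** Copy the block rows updated by the phase-3 chunk [beginTask, endTask) from srcDev
 * to dstDev. Task indices skip the pivot block row; if the pivot row falls inside the
 * copied range, the copy is split around it so that row is never overwritten.
 */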
inline void phase3TaskCopy(int srcDev, int dstDev, Graph graphs[], cudaStream_t stream, int blkId, int beginTask, int endTask)
{
int n = graphs[0].n, blkFactor = graphs[0].blkFactor;
int beginId = beginTask + (beginTask >= blkId), endId = endTask + (endTask - 1 >= blkId);
assert(endId > beginId);
if (blkId >= beginId && blkId < endId) // Skip the pivot block row so we don't interfere with the other stream on the same device
{
CHKERR(cudaMemcpyPeerAsync(
graphs[dstDev](beginId, 0).addr,
dstDev,
graphs[srcDev](beginId, 0).addr,
srcDev,
(blkId - beginId) * blkFactor * n * sizeof(int),
stream
));
CHKERR(cudaMemcpyPeerAsync(
graphs[dstDev](blkId + 1, 0).addr,
dstDev,
graphs[srcDev](blkId + 1, 0).addr,
srcDev,
(min(n, endId * blkFactor) - (blkId + 1) * blkFactor) * n * sizeof(int),
stream
));
} else
CHKERR(cudaMemcpyPeerAsync(
graphs[dstDev](beginId, 0).addr,
dstDev,
graphs[srcDev](beginId, 0).addr,
srcDev,
(min(n, endId * blkFactor) - beginId * blkFactor) * n * sizeof(int),
stream
));
}
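/** Split the phase-3 block-row tasks between the two GPUs.
 * Two OpenMP threads pull chunks of DYNAMIC_CHUNK_SIZE tasks, launch the kernel on
 * their own device's compute stream, wait for it, and push the finished block rows
 * to the other device on the peer-copy stream. Both devices are synchronized before
 * returning.
 */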
template <void(*kernel)(Graph,int,int), void(*taskCopy)(int,int,Graph*,cudaStream_t,int,int,int)>
void dispatch(int taskNum, int threadsNum, int sharedSize, Graph graphs[], cudaStream_t streams[], int blkId)
{
const int DYNAMIC_CHUNK_SIZE = 14 * 1; // 14 = # of SM
#pragma omp parallel for num_threads(2) schedule(dynamic, 1)
for (int st = 0; st < taskNum; st += DYNAMIC_CHUNK_SIZE)
{
int tid = omp_get_thread_num();
assert(tid == 0 || tid == 1);
CHKERR(cudaSetDevice(tid));
int en = min(taskNum, st + DYNAMIC_CHUNK_SIZE);
kernel<<<en - st, threadsNum, sharedSize, streams[tid]>>>(graphs[tid], blkId, st);
CHKERR(cudaStreamSynchronize(streams[tid]));
taskCopy(tid, !tid, graphs, streams[tid + 2], blkId, st, en);
}
for (int i = 0; i < GPU_NUM; i++)
{
CHKERR(cudaSetDevice(i));
CHKERR(cudaDeviceSynchronize());
}
}
#endif // DUAL_GPU
#define div4(x) (((x) >> 2) + bool((x) & 3)) // ceil(x / 4): each thread handles up to 4 elements (see the quad macro)
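/** One outer iteration of blocked Floyd–Warshall for pivot block blkId:
 * phase 1 on the pivot block, phase 2 on the pivot block row and column, and
 * phase 3 on all remaining blocks. With DUAL_GPU, phases 1 and 2 run on every
 * device and phase 3 is split between the devices via dispatch().
 */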
void stage(Graph graphs[], cudaStream_t streams[], int blkId)
{
int pivotCnt = graphs[0](blkId, blkId).count(), maxCnt = graphs[0].blkFactor * graphs[0].blkFactor;
int pivotSize = pivotCnt * sizeof(int), maxSize = maxCnt * sizeof(int);
for (int i = 0; i < GPU_NUM; i++)
{
CHKERR(cudaSetDevice(i));
phase1Kernel<<<1, div4(pivotCnt), pivotSize, streams[i]>>>(graphs[i], blkId);
if (graphs[0].blkNum > 1)
{
phase2Kernel<<<dim3(2, graphs[0].blkNum - 1), div4(maxCnt), pivotSize + maxSize, streams[i]>>>(graphs[i], blkId, 0);
#ifndef DUAL_GPU
phase3Kernel<<<graphs[0].blkNum - 1, div4(maxCnt), 2 * maxSize, streams[i]>>>(graphs[i], blkId, 0);
#endif
}
}
#ifdef DUAL_GPU
if (graphs[0].blkNum > 1)
dispatch<phase3Kernel, phase3TaskCopy>(graphs[0].blkNum - 1, div4(maxCnt), 2 * maxSize, graphs, streams, blkId);
#endif
}
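/** Read "n m" followed by m edges "x y w" (vertices are 1-indexed), run blkNum
 * stages of blocked Floyd–Warshall, and print the n x n distance matrix with 0 on
 * the diagonal and "INF" for unreachable pairs.
 */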
int main(const int argc, const char *const *argv)
{
if (argc != 4)
{
puts("Usage: ./<executable name> <input file> <output file> <blocking factor>");
return 1;
}
const char *inName = argv[1], *outName = argv[2];
const int blkFactor = atoi(argv[3]);
static char inBuf[1 << 20];
FILE *inFile = fopen(inName, "r");
setvbuf(inFile, inBuf, _IOFBF, 1 << 20);
int n, m;
fastRead(inFile, n), fastRead(inFile, m);
#ifdef DUAL_GPU
{
cudaError_t err;
CHKERR(cudaSetDevice(0));
err = cudaDeviceEnablePeerAccess(1, 0), assert(err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled);
CHKERR(cudaSetDevice(1));
err = cudaDeviceEnablePeerAccess(0, 0), assert(err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled);
CHKERR(cudaSetDevice(0));
}
#endif
int *hostAddr;
CHKERR(cudaMallocHost((void**)&hostAddr, n * n * sizeof(int)));
Graph hostGraph(hostAddr, n, blkFactor);
int *deviceAddrs[GPU_NUM];
Graph deviceGraphs[GPU_NUM];
for (int i = 0; i < GPU_NUM; i++)
{
CHKERR(cudaSetDevice(i));
CHKERR(cudaMalloc((void**)(deviceAddrs + i), n * n * sizeof(int)));
deviceGraphs[i] = Graph(deviceAddrs[i], n, blkFactor);
}
cudaStream_t streams[STREAM_NUM]; // Default stream can't work correctly with cudaMemcpyPeerAsync
for (int i = 0; i < STREAM_NUM; i++)
{
CHKERR(cudaSetDevice(i & 1));
CHKERR(cudaStreamCreate(streams + i));
}
CHKERR(cudaSetDevice(0));
memset(hostAddr, 0x01, n * n * sizeof(int)); // fill every int with 0x01010101 as the "infinity" sentinel
for (int i = 0; i < m; i++)
{
int x, y, w;
fastRead(inFile, x), fastRead(inFile, y), fastRead(inFile, w);
hostGraph.edge(x - 1, y - 1) = w;
}
fclose(inFile);
for (int i = 0; i < GPU_NUM; i++)
{
CHKERR(cudaSetDevice(i));
hostGraph.copyToDevice(deviceGraphs[i], streams[i]);
}
for (int i = 0; i < deviceGraphs[0].blkNum; i++)
stage(deviceGraphs, streams, i);
CHKERR(cudaSetDevice(0));
deviceGraphs[0].copyToHost(hostGraph);
FILE *outFile = fopen(outName, "w");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
int res = i == j ? 0 : hostGraph.edge(i, j);
if (res == 0x01010101)
fprintf(outFile, "%s ", "INF");
else
fprintf(outFile, "%d ", res);
}
fputc('\n', outFile);
}
fclose(outFile);
CHKERR(cudaFreeHost(hostAddr));
for (int i = 0; i < GPU_NUM; i++)
{
CHKERR(cudaSetDevice(i));
CHKERR(cudaFree(deviceAddrs[i]));
}
for (int i = 0; i < STREAM_NUM; i++)
{
CHKERR(cudaSetDevice(i & 1));
CHKERR(cudaStreamDestroy(streams[i]));
}
CHKERR(cudaSetDevice(0));
return 0;
}