Skip to content

Instantly share code, notes, and snippets.

@CraigglesO

CraigglesO/bf.cu

Created Feb 2, 2018
Embed
What would you like to do?
// A C++ program for Bellman-Ford's single source
// shortest path algorithm.
// nvcc -std=c++11 BellmanFord.cu -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__
#include <stdio.h>
#include <climits>
#include <random>
#include <chrono>
#include <ctime>
using namespace std;
using namespace std::chrono;
// a structure to represent a weighted edge in graph
struct Edge {
int src, dest, weight;
};
// A utility function used to print the solution
void printArr(int dist[], int n) {
printf("Vertex \t\t Distance from Source\n");
for (int i = 0; i < n; ++i)
printf("%d \t\t %d\n", i, dist[i]);
}
void BellmanFord_CPU(int V, int E, struct Edge* edges, int *dist, int src) {
// Step 1: Initialize distances from src to all other vertices
// as INFINITE
for (int i = 0; i < V; i++)
dist[i] = INT_MAX;
dist[src] = 0;
// Step 2: Relax all edges |V| - 1 times. A simple shortest
// path from src to any other vertex can have at-most |V| - 1
// edges
for (int i = 1; i <= V-1; i++) {
for (int j = 0; j < E; j++) {
int u = edges[j].src;
int v = edges[j].dest;
int weight = edges[j].weight;
if (dist[u] != INT_MAX && dist[u] + weight < dist[v])
dist[v] = dist[u] + weight;
}
}
return;
}
// The main function that finds shortest distances from src to
// all other vertices using Bellman-Ford algorithm. The function
// also detects negative weight cycle
__global__ void BellmanFord(int V, int E, struct Edge* edges, int *dist, int src) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// Step 1: Initialize distances from src to all other vertices
// as INFINITE. While a little wasteful, V will always be smaller then E
for (int i = index; i < V; i += stride)
dist[i] = INT_MAX;
dist[src] = 0;
// Step 2: Relax all edges |V| - 1 times. A simple shortest
// path from src to any other vertex can have at-most |V| - 1
// edges
for (int i = 1; i <= V - 1; i++) {
for (int j = index; j < E; j += stride) {
int u = edges[j].src;
int v = edges[j].dest;
int weight = edges[j].weight;
// NOTE: The issue here is that two threads might try to write to the
// same memory allocation (at the same time). To avoid this we should use atomics.
if (dist[u] != INT_MAX && dist[u] + weight < dist[v])
atomicMin(&dist[v], dist[u] + weight);
}
__syncthreads();
}
return;
}
// Driver program to test above functions
int main() {
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
/* Let us create the graph given in above example */
int V = 10000; // Number of vertices in graph
int E = 35000; // Number of edges in graph
int *dist_CPU, *dist;
Edge *edges;
edges = (Edge*)malloc(E*sizeof(Edge));
dist_CPU = (int*)malloc(V*sizeof(int));
dist = (int*)malloc(V*sizeof(int));
Edge *d_edges;
int *d_dist;
// allocate memory
cudaMalloc(&d_edges, E * sizeof(Edge));
cudaMalloc(&d_dist, V * sizeof(int));
// ensure at least 1 connection
edges[0].src = 0;
edges[0].dest = 2;
edges[0].weight = 1;
for (int j = 1; j < E; j++) {
edges[j].src = rand()%(V) + 0;
edges[j].dest = rand()%(V) + 0;
edges[j].weight = rand()%(50 - 1 + 1) + 1;
// printf("j: %d - src: %d - dest: %d - weight: %d\n", j, edges[j].src, edges[j].dest, edges[j].weight);
}
/** CPU VERSION START **/
high_resolution_clock::time_point cpu_start = high_resolution_clock::now();
BellmanFord_CPU(V, E, edges, dist_CPU, 0);
high_resolution_clock::time_point cpu_stop = high_resolution_clock::now();
long cpu_duration = duration_cast<milliseconds>( cpu_stop - cpu_start ).count();
printf("CPU time: %ld\n", cpu_duration);
/** CPU VERSION END **/
// setup threads
int blockSize = 256;
int numBlocks = (E + blockSize - 1) / blockSize;
// copy over to device
cudaMemcpy(d_edges, edges, E*sizeof(Edge), cudaMemcpyHostToDevice);
cudaMemcpy(d_dist, dist, V*sizeof(int), cudaMemcpyHostToDevice);
cudaEventRecord(start, 0);
BellmanFord<<<numBlocks, blockSize>>>(V, E, d_edges, d_dist, 0);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
// get result back to host
cudaMemcpy(dist, d_dist, V*sizeof(int), cudaMemcpyDeviceToHost);
// COMPARE:
// printArr(dist, V);
// printArr(dist_CPU, V);
int x = 0;
for (int i = 0; i < V; i++) {
if (dist[i] != dist_CPU[i]) {
// printf("dist[%i]: %i ", i, dist[i]);
// printf("dist_CPU[%i]: %i\n", i, dist_CPU[i]);
x++;
}
}
printf("number of differences: %i\n", x);
// cleanup
cudaFree(d_edges);
cudaFree(d_dist);
free(dist_CPU);
free(dist);
free(edges);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Time for the kernel: %f ms\n", time);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.