CraigglesO/bf.cu

## bf.cu
// A C++ program for Bellman-Ford's single source
// shortest path algorithm.
// nvcc -std=c++11 BellmanFord.cu  -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__
#include <stdio.h>
#include <climits>
#include <random>
#include <chrono>
#include <ctime>

using namespace std;
using namespace std::chrono;

// a structure to represent a weighted edge in graph
struct Edge {
  int src, dest, weight;
};

// A utility function used to print the solution
void printArr(int dist[], int n) {
  printf("Vertex \t\t Distance from Source\n");
  for (int i = 0; i < n; ++i)
    printf("%d \t\t %d\n", i, dist[i]);
}

void BellmanFord_CPU(int V, int E, struct Edge* edges, int *dist, int src) {

  // Step 1: Initialize distances from src to all other vertices
  // as INFINITE
  for (int i = 0; i < V; i++)
    dist[i] = INT_MAX;
  dist[src] = 0;

  // Step 2: Relax all edges |V| - 1 times. A simple shortest
  // path from src to any other vertex can have at-most |V| - 1
  // edges
  for (int i = 1; i <= V-1; i++) {
    for (int j = 0; j < E; j++) {
      int u = edges[j].src;
      int v = edges[j].dest;
      int weight = edges[j].weight;
      if (dist[u] != INT_MAX && dist[u] + weight < dist[v])
        dist[v] = dist[u] + weight;
    }
  }
  return;
}

// The main function that finds shortest distances from src to
// all other vertices using Bellman-Ford algorithm.  The function
// also detects negative weight cycle
__global__ void BellmanFord(int V, int E, struct Edge* edges, int *dist, int src) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  // Step 1: Initialize distances from src to all other vertices
  // as INFINITE. While a little wasteful, V will always be smaller then E
  for (int i = index; i < V; i += stride)
    dist[i] = INT_MAX;
  dist[src] = 0;

  // Step 2: Relax all edges |V| - 1 times. A simple shortest
  // path from src to any other vertex can have at-most |V| - 1
  // edges
  for (int i = 1; i <= V - 1; i++) {
    for (int j = index; j < E; j += stride) {
      int u      = edges[j].src;
      int v      = edges[j].dest;
      int weight = edges[j].weight;
      // NOTE: The issue here is that two threads might try to write to the
      // same memory allocation (at the same time). To avoid this we should use atomics.
      if (dist[u] != INT_MAX && dist[u] + weight < dist[v])
        atomicMin(&dist[v], dist[u] + weight);
    }
    __syncthreads();
  }

  return;
}

// Driver program to test above functions
int main() {
  cudaEvent_t start, stop;
  float time;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  /* Let us create the graph given in above example */
  int V = 10000;  // Number of vertices in graph
  int E = 35000;  // Number of edges in graph
  int *dist_CPU, *dist;
  Edge *edges;
  edges    = (Edge*)malloc(E*sizeof(Edge));
  dist_CPU = (int*)malloc(V*sizeof(int));
  dist     = (int*)malloc(V*sizeof(int));

  Edge *d_edges;
  int *d_dist;

  // allocate memory
  cudaMalloc(&d_edges, E * sizeof(Edge));
  cudaMalloc(&d_dist, V * sizeof(int));

  // ensure at least 1 connection
  edges[0].src = 0;
  edges[0].dest = 2;
  edges[0].weight = 1;

  for (int j = 1; j < E; j++) {
    edges[j].src    = rand()%(V) + 0;
    edges[j].dest   = rand()%(V) + 0;
    edges[j].weight = rand()%(50 - 1 + 1) + 1;
    // printf("j: %d - src: %d - dest: %d - weight: %d\n", j, edges[j].src, edges[j].dest, edges[j].weight);
  }

  /** CPU VERSION START **/
  high_resolution_clock::time_point cpu_start = high_resolution_clock::now();
  BellmanFord_CPU(V, E, edges, dist_CPU, 0);
  high_resolution_clock::time_point cpu_stop = high_resolution_clock::now();
  long cpu_duration = duration_cast<milliseconds>( cpu_stop - cpu_start ).count();
  printf("CPU time: %ld\n", cpu_duration);
  /** CPU VERSION END **/

  // setup threads
  int blockSize = 256;
  int numBlocks = (E + blockSize - 1) / blockSize;

  // copy over to device
  cudaMemcpy(d_edges, edges, E*sizeof(Edge), cudaMemcpyHostToDevice);
  cudaMemcpy(d_dist, dist, V*sizeof(int), cudaMemcpyHostToDevice);

  cudaEventRecord(start, 0);
  BellmanFord<<<numBlocks, blockSize>>>(V, E, d_edges, d_dist, 0);
  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);

  // get result back to host
  cudaMemcpy(dist, d_dist, V*sizeof(int), cudaMemcpyDeviceToHost);

  // COMPARE:
  // printArr(dist, V);
  // printArr(dist_CPU, V);
  int x = 0;
  for (int i = 0; i < V; i++) {
    if (dist[i] != dist_CPU[i]) {
      // printf("dist[%i]: %i     ", i, dist[i]);
      // printf("dist_CPU[%i]: %i\n", i, dist_CPU[i]);
      x++;
    }
  }
  printf("number of differences: %i\n", x);

  // cleanup
  cudaFree(d_edges);
  cudaFree(d_dist);
  free(dist_CPU);
  free(dist);
  free(edges);

  cudaEventElapsedTime(&time, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  printf("Time for the kernel: %f ms\n", time);

  return 0;
}
	// A C++ program for Bellman-Ford's single source
	// shortest path algorithm.
	// nvcc -std=c++11 BellmanFord.cu -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__
	#include <stdio.h>
	#include <climits>
	#include <random>
	#include <chrono>
	#include <ctime>

	using namespace std;
	using namespace std::chrono;

	// a structure to represent a weighted edge in graph
	struct Edge {
	int src, dest, weight;
	};

	// A utility function used to print the solution
	void printArr(int dist[], int n) {
	printf("Vertex \t\t Distance from Source\n");
	for (int i = 0; i < n; ++i)
	printf("%d \t\t %d\n", i, dist[i]);
	}

	void BellmanFord_CPU(int V, int E, struct Edge* edges, int *dist, int src) {

	// Step 1: Initialize distances from src to all other vertices
	// as INFINITE
	for (int i = 0; i < V; i++)
	dist[i] = INT_MAX;
	dist[src] = 0;

	// Step 2: Relax all edges \|V\| - 1 times. A simple shortest
	// path from src to any other vertex can have at-most \|V\| - 1
	// edges
	for (int i = 1; i <= V-1; i++) {
	for (int j = 0; j < E; j++) {
	int u = edges[j].src;
	int v = edges[j].dest;
	int weight = edges[j].weight;
	if (dist[u] != INT_MAX && dist[u] + weight < dist[v])
	dist[v] = dist[u] + weight;
	}
	}
	return;
	}

	// The main function that finds shortest distances from src to
	// all other vertices using Bellman-Ford algorithm. The function
	// also detects negative weight cycle
	__global__ void BellmanFord(int V, int E, struct Edge* edges, int *dist, int src) {
	int index = blockIdx.x * blockDim.x + threadIdx.x;
	int stride = blockDim.x * gridDim.x;
	// Step 1: Initialize distances from src to all other vertices
	// as INFINITE. While a little wasteful, V will always be smaller then E
	for (int i = index; i < V; i += stride)
	dist[i] = INT_MAX;
	dist[src] = 0;

	// Step 2: Relax all edges \|V\| - 1 times. A simple shortest
	// path from src to any other vertex can have at-most \|V\| - 1
	// edges
	for (int i = 1; i <= V - 1; i++) {
	for (int j = index; j < E; j += stride) {
	int u = edges[j].src;
	int v = edges[j].dest;
	int weight = edges[j].weight;
	// NOTE: The issue here is that two threads might try to write to the
	// same memory allocation (at the same time). To avoid this we should use atomics.
	if (dist[u] != INT_MAX && dist[u] + weight < dist[v])
	atomicMin(&dist[v], dist[u] + weight);
	}
	__syncthreads();
	}

	return;
	}

	// Driver program to test above functions
	int main() {
	cudaEvent_t start, stop;
	float time;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	/* Let us create the graph given in above example */
	int V = 10000; // Number of vertices in graph
	int E = 35000; // Number of edges in graph
	int dist_CPU, dist;
	Edge *edges;
	edges = (Edge)malloc(Esizeof(Edge));
	dist_CPU = (int)malloc(Vsizeof(int));
	dist = (int)malloc(Vsizeof(int));

	Edge *d_edges;
	int *d_dist;

	// allocate memory
	cudaMalloc(&d_edges, E * sizeof(Edge));
	cudaMalloc(&d_dist, V * sizeof(int));

	// ensure at least 1 connection
	edges[0].src = 0;
	edges[0].dest = 2;
	edges[0].weight = 1;

	for (int j = 1; j < E; j++) {
	edges[j].src = rand()%(V) + 0;
	edges[j].dest = rand()%(V) + 0;
	edges[j].weight = rand()%(50 - 1 + 1) + 1;
	// printf("j: %d - src: %d - dest: %d - weight: %d\n", j, edges[j].src, edges[j].dest, edges[j].weight);
	}

	/ CPU VERSION START /
	high_resolution_clock::time_point cpu_start = high_resolution_clock::now();
	BellmanFord_CPU(V, E, edges, dist_CPU, 0);
	high_resolution_clock::time_point cpu_stop = high_resolution_clock::now();
	long cpu_duration = duration_cast<milliseconds>( cpu_stop - cpu_start ).count();
	printf("CPU time: %ld\n", cpu_duration);
	/ CPU VERSION END /

	// setup threads
	int blockSize = 256;
	int numBlocks = (E + blockSize - 1) / blockSize;

	// copy over to device
	cudaMemcpy(d_edges, edges, E*sizeof(Edge), cudaMemcpyHostToDevice);
	cudaMemcpy(d_dist, dist, V*sizeof(int), cudaMemcpyHostToDevice);

	cudaEventRecord(start, 0);
	BellmanFord<<<numBlocks, blockSize>>>(V, E, d_edges, d_dist, 0);
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);

	// get result back to host
	cudaMemcpy(dist, d_dist, V*sizeof(int), cudaMemcpyDeviceToHost);

	// COMPARE:
	// printArr(dist, V);
	// printArr(dist_CPU, V);
	int x = 0;
	for (int i = 0; i < V; i++) {
	if (dist[i] != dist_CPU[i]) {
	// printf("dist[%i]: %i ", i, dist[i]);
	// printf("dist_CPU[%i]: %i\n", i, dist_CPU[i]);
	x++;
	}
	}
	printf("number of differences: %i\n", x);

	// cleanup
	cudaFree(d_edges);
	cudaFree(d_dist);
	free(dist_CPU);
	free(dist);
	free(edges);

	cudaEventElapsedTime(&time, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	printf("Time for the kernel: %f ms\n", time);

	return 0;
	}