Skip to content

Instantly share code, notes, and snippets.

@deckar01
Created September 23, 2021 18:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deckar01/a0df85056f2635677187b011000d84ed to your computer and use it in GitHub Desktop.
Save deckar01/a0df85056f2635677187b011000d84ed to your computer and use it in GitHub Desktop.
[AZSPCS] Monochromatic Triangles: CUDA Simulated Annealing
#include <iostream>
#include <iomanip>
#include <math.h>
#include <curand.h>
#include <curand_kernel.h>
#define M (N * (N - 1) / 2)
#define index (blockDim.x * blockIdx.x + threadIdx.x)
#define offset (M * index)
#define roll(n) (curand(&rng[index]) % (n))
#define TRI(x, y) ((y * (y - 1) / 2) + x)
#define TRI_O(x, y) (x < y ? TRI(x, y) : TRI(y, x))
typedef uint8_t color;
__global__
void init(int N, int *counts, color *edges, curandState_t *rng, time_t seed)
{
// Randomly assign edge colors.
curand_init(seed, index, 0, &rng[index]);
for (int i = 0; i < M; i++)
edges[offset + i] = roll(3);
// Count the monochromatic triangles.
counts[index] = 0;
for (int x = 0; x < N; x++) {
for (int y = x + 1; y < N; y++) {
color a = edges[offset + TRI(x, y)];
for (int z = y + 1; z < N; z++) {
color b = edges[offset + TRI(y, z)];
color c = edges[offset + TRI(x, z)];
if (a == b && b == c) counts[index]++;
}
}
}
}
__global__
void search(int N, int LIMIT, int *counts, color *edges, curandState_t *rng)
{
// Simulate annealing (gradient descent + random walk).
for (int t = 0; t < LIMIT; t++) {
// Select an edge randomly.
int x = roll(N);
int y = roll(N - 1);
if (y >= x) y++;
color a = edges[offset + TRI_O(x, y)];
// Select a neighboring color randomly.
color neighbor = (a + (roll(2) + 1)) % 3;
// Compute the score with the new color.
int neighbor_score = counts[index];
for (int z = 0; z < N; z++) {
if (z == x || z == y) continue;
// Get the adjacent edges.
color b = edges[offset + TRI_O(z, y)];
color c = edges[offset + TRI_O(z, x)];
// Check for an unaffected non-monochromatic triangle.
if (b != c) continue;
// Check for breaking a monochromatic triangle.
if (a == b) {
if (neighbor != b) neighbor_score--;
}
// Check for creating a monochromatic triangle.
else {
if (neighbor == b) neighbor_score++;
}
}
// Always keep improvements
if (neighbor_score < counts[index]) {
counts[index] = neighbor_score;
edges[offset + TRI_O(x, y)] = neighbor;
// OPTIMIZATION: Don't cache local minima.
// O(M) copies are expensive and mostly go unused.
// The final state will converge to the global minima.
}
// Keep non-improvements randomly.
else {
// Cooling schedule heuristic.
double T = 10 * (1.0f - (double)t / LIMIT);
double delta = counts[index] - neighbor_score;
double bar = curand_uniform_double(&rng[index]);
// Weighted by an exponential cooling schedule.
if (exp(delta / T) >= bar) {
counts[index] = neighbor_score;
edges[offset + TRI_O(x, y)] = neighbor;
}
}
}
}
int main(int argc, char *argv[])
{
// Graph size.
int N = atoi(argv[1]);
// Simulated annealing iteration limit heuristic.
int LIMIT = 220 * M;
// RTX 3090's CUDA core count.
int nodes = 10496;
// Optimal block size for this kernel for small N.
int block_size = 128;
int blocks = nodes / block_size;
// Print a summary of the current search.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
std::cout << "N: " << N << std::endl;
std::cout << "Device: " << prop.name << std::endl;
std::cout << "Blocks: " << blocks << std::endl;
std::cout << "Threads per Block: " << block_size << std::endl;
std::cout << "Total Threads: " << nodes << std::endl << std::endl;
// Allocate memory.
int *counts;
color *edges;
curandState_t *rng;
// Memory that needs to be read by the host.
cudaMallocManaged(&counts, nodes * sizeof(int));
cudaMallocManaged(&edges, nodes * M * sizeof(color));
// Memory that only the device uses.
cudaMalloc(&rng, nodes * sizeof(curandState_t));
// Setup GPU timers.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Randomize the starting graphs.
cudaEventRecord(start);
init <<<blocks, block_size >>> (N, counts, edges, rng, time(NULL));
cudaEventRecord(stop);
cudaDeviceSynchronize();
float t;
cudaEventElapsedTime(&t, start, stop);
float triangle_speed = (float)nodes * (M * (N - 2) / 2) / (t * 1e-3);
std::cout << std::scientific << std::setprecision(1);
std::cout << "Setup :: ";
std::cout << "Ts:" << triangle_speed << std::endl;
std::cout << "----" << std::endl << std::endl;
// Search until all monochromatic triangles are eliminated.
int minima = INT_MAX;
while (minima > 0) {
// Saturate the GPU with simulated annealing search threads.
cudaEventRecord(start);
search <<<blocks, block_size>>> (N, LIMIT, counts, edges, rng);
cudaEventRecord(stop);
cudaDeviceSynchronize();
// Calculate the execution time and speed.
cudaEventElapsedTime(&t, start, stop);
triangle_speed = (float)nodes * LIMIT * N / (t * 1e-3);
// Find the best improvement in the batch.
int improvement = -1;
for (int b = 0; b < nodes; b++) {
if (counts[b] < minima) {
improvement = b;
minima = counts[b];
}
}
// Overwrite the last monitor.
std::cout << " \r";
// Report improvements after each batch.
if (improvement >= 0) {
for (int i = 1; i < N; i++) {
for (int j = 0; j < i; j++)
std::cout << (int)edges[improvement * M + TRI(j, i)];
if (i < N - 1) std::cout << ",";
}
std::cout << std::endl << std::endl;
std::cout << minima << " :: ";
}
// Monitor the limit and processing speed.
std::cout << "L:" << (float)LIMIT << ", ";
std::cout << "T/s:" << triangle_speed << " ";
// Start a new monitor after a report.
if (improvement >= 0) {
std::cout << std::endl << "----" << std::endl << std::endl;
}
std::cout << std::flush;
// Slowly search deeper into the solution space.
LIMIT += (LIMIT * 0.05 + 1);
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment