deckar01/mono-tri-sim-anneal.cu

## mono-tri-sim-anneal.cu
#include <iostream>
#include <iomanip>
#include <math.h>
#include <curand.h>
#include <curand_kernel.h>

#define M (N * (N - 1) / 2)
#define index (blockDim.x * blockIdx.x + threadIdx.x)
#define offset (M * index)
#define roll(n) (curand(&rng[index]) % (n))
#define TRI(x, y) ((y * (y - 1) / 2) + x)
#define TRI_O(x, y) (x < y ? TRI(x, y) : TRI(y, x))

typedef uint8_t color;

__global__
void init(int N, int *counts, color *edges, curandState_t *rng, time_t seed)
{
    // Randomly assign edge colors.
    curand_init(seed, index, 0, &rng[index]);
    for (int i = 0; i < M; i++)
    edges[offset + i] = roll(3);

    // Count the monochromatic triangles.
    counts[index] = 0;
    for (int x = 0; x < N; x++) {
        for (int y = x + 1; y < N; y++) {
            color a = edges[offset + TRI(x, y)];
            for (int z = y + 1; z < N; z++) {
                color b = edges[offset + TRI(y, z)];
                color c = edges[offset + TRI(x, z)];
                if (a == b && b == c) counts[index]++;
            }
        }
    }
}

__global__
void search(int N, int LIMIT, int *counts, color *edges, curandState_t *rng)
{
    // Simulate annealing (gradient descent + random walk).
    for (int t = 0; t < LIMIT; t++) {
        // Select an edge randomly.
        int x = roll(N);
        int y = roll(N - 1);
        if (y >= x) y++;
        color a = edges[offset + TRI_O(x, y)];
        // Select a neighboring color randomly.
        color neighbor = (a + (roll(2) + 1)) % 3;
        // Compute the score with the new color.
        int neighbor_score = counts[index];
        for (int z = 0; z < N; z++) {
            if (z == x || z == y) continue;
            // Get the adjacent edges.
            color b = edges[offset + TRI_O(z, y)];
            color c = edges[offset + TRI_O(z, x)];
            // Check for an unaffected non-monochromatic triangle.
            if (b != c) continue;
            // Check for breaking a monochromatic triangle.
            if (a == b) {
                if (neighbor != b) neighbor_score--;
            }
            // Check for creating a monochromatic triangle.
            else {
                if (neighbor == b) neighbor_score++;
            }
        }
        // Always keep improvements
        if (neighbor_score < counts[index]) {
            counts[index] = neighbor_score;
            edges[offset + TRI_O(x, y)] = neighbor;
            // OPTIMIZATION: Don't cache local minima.
            // O(M) copies are expensive and mostly go unused.
            // The final state will converge to the global minima.
        }
        // Keep non-improvements randomly.
        else {
            // Cooling schedule heuristic.
            double T = 10 * (1.0f - (double)t / LIMIT);
            double delta = counts[index] - neighbor_score;
            double bar = curand_uniform_double(&rng[index]);
            // Weighted by an exponential cooling schedule.
            if (exp(delta / T) >= bar) {
                counts[index] = neighbor_score;
                edges[offset + TRI_O(x, y)] = neighbor;
            }
        }
    }
}

int main(int argc, char *argv[])
{
    // Graph size.
    int N = atoi(argv[1]);
    // Simulated annealing iteration limit heuristic.
    int LIMIT = 220 * M;

    // RTX 3090's CUDA core count.
    int nodes = 10496;
    // Optimal block size for this kernel for small N.
    int block_size = 128;
    int blocks = nodes / block_size;

    // Print a summary of the current search.
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    std::cout << "N: " << N << std::endl;
    std::cout << "Device: " << prop.name << std::endl;
    std::cout << "Blocks: " << blocks << std::endl;
    std::cout << "Threads per Block: " << block_size << std::endl;
    std::cout << "Total Threads: " << nodes << std::endl << std::endl;

    // Allocate memory.
    int *counts;
    color *edges;
    curandState_t *rng;
    // Memory that needs to be read by the host.
    cudaMallocManaged(&counts, nodes * sizeof(int));
    cudaMallocManaged(&edges, nodes * M * sizeof(color));
    // Memory that only the device uses.
    cudaMalloc(&rng, nodes * sizeof(curandState_t));

    // Setup GPU timers.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Randomize the starting graphs.
    cudaEventRecord(start);
    init <<<blocks, block_size >>> (N, counts, edges, rng, time(NULL));
    cudaEventRecord(stop);
    cudaDeviceSynchronize();

    float t;
    cudaEventElapsedTime(&t, start, stop);
    float triangle_speed = (float)nodes * (M * (N - 2) / 2) / (t * 1e-3);
    std::cout << std::scientific << std::setprecision(1);
    std::cout << "Setup :: ";
    std::cout << "Ts:" << triangle_speed << std::endl;
    std::cout << "----" << std::endl << std::endl;

    // Search until all monochromatic triangles are eliminated.
    int minima = INT_MAX;
    while (minima > 0) {
        // Saturate the GPU with simulated annealing search threads.
        cudaEventRecord(start);
        search <<<blocks, block_size>>> (N, LIMIT, counts, edges, rng);
        cudaEventRecord(stop);
        cudaDeviceSynchronize();

        // Calculate the execution time and speed.
        cudaEventElapsedTime(&t, start, stop);
        triangle_speed = (float)nodes * LIMIT * N / (t * 1e-3);

        // Find the best improvement in the batch.
        int improvement = -1;
        for (int b = 0; b < nodes; b++) {
            if (counts[b] < minima) {
                improvement = b;
                minima = counts[b];
            }
        }

        // Overwrite the last monitor.
        std::cout << "                   \r";

        // Report improvements after each batch.
        if (improvement >= 0) {
            for (int i = 1; i < N; i++) {
                for (int j = 0; j < i; j++)
                    std::cout << (int)edges[improvement * M + TRI(j, i)];
                if (i < N - 1) std::cout << ",";
            }
            std::cout << std::endl << std::endl;
            std::cout << minima << " :: ";
        }

        // Monitor the limit and processing speed.
        std::cout << "L:" << (float)LIMIT << ", ";
        std::cout << "T/s:" << triangle_speed << " ";

        // Start a new monitor after a report.
        if (improvement >= 0) {
            std::cout << std::endl << "----" << std::endl << std::endl;
        }
        std::cout << std::flush;

        // Slowly search deeper into the solution space.
        LIMIT += (LIMIT * 0.05 + 1);
    }

    return 0;
}
	#include <iostream>
	#include <iomanip>
	#include <math.h>
	#include <curand.h>
	#include <curand_kernel.h>

	#define M (N * (N - 1) / 2)
	#define index (blockDim.x * blockIdx.x + threadIdx.x)
	#define offset (M * index)
	#define roll(n) (curand(&rng[index]) % (n))
	#define TRI(x, y) ((y * (y - 1) / 2) + x)
	#define TRI_O(x, y) (x < y ? TRI(x, y) : TRI(y, x))

	typedef uint8_t color;

	__global__
	void init(int N, int counts, color edges, curandState_t *rng, time_t seed)
	{
	// Randomly assign edge colors.
	curand_init(seed, index, 0, &rng[index]);
	for (int i = 0; i < M; i++)
	edges[offset + i] = roll(3);

	// Count the monochromatic triangles.
	counts[index] = 0;
	for (int x = 0; x < N; x++) {
	for (int y = x + 1; y < N; y++) {
	color a = edges[offset + TRI(x, y)];
	for (int z = y + 1; z < N; z++) {
	color b = edges[offset + TRI(y, z)];
	color c = edges[offset + TRI(x, z)];
	if (a == b && b == c) counts[index]++;
	}
	}
	}
	}

	__global__
	void search(int N, int LIMIT, int counts, color edges, curandState_t *rng)
	{
	// Simulate annealing (gradient descent + random walk).
	for (int t = 0; t < LIMIT; t++) {
	// Select an edge randomly.
	int x = roll(N);
	int y = roll(N - 1);
	if (y >= x) y++;
	color a = edges[offset + TRI_O(x, y)];
	// Select a neighboring color randomly.
	color neighbor = (a + (roll(2) + 1)) % 3;
	// Compute the score with the new color.
	int neighbor_score = counts[index];
	for (int z = 0; z < N; z++) {
	if (z == x \|\| z == y) continue;
	// Get the adjacent edges.
	color b = edges[offset + TRI_O(z, y)];
	color c = edges[offset + TRI_O(z, x)];
	// Check for an unaffected non-monochromatic triangle.
	if (b != c) continue;
	// Check for breaking a monochromatic triangle.
	if (a == b) {
	if (neighbor != b) neighbor_score--;
	}
	// Check for creating a monochromatic triangle.
	else {
	if (neighbor == b) neighbor_score++;
	}
	}
	// Always keep improvements
	if (neighbor_score < counts[index]) {
	counts[index] = neighbor_score;
	edges[offset + TRI_O(x, y)] = neighbor;
	// OPTIMIZATION: Don't cache local minima.
	// O(M) copies are expensive and mostly go unused.
	// The final state will converge to the global minima.
	}
	// Keep non-improvements randomly.
	else {
	// Cooling schedule heuristic.
	double T = 10 * (1.0f - (double)t / LIMIT);
	double delta = counts[index] - neighbor_score;
	double bar = curand_uniform_double(&rng[index]);
	// Weighted by an exponential cooling schedule.
	if (exp(delta / T) >= bar) {
	counts[index] = neighbor_score;
	edges[offset + TRI_O(x, y)] = neighbor;
	}
	}
	}
	}

	int main(int argc, char *argv[])
	{
	// Graph size.
	int N = atoi(argv[1]);
	// Simulated annealing iteration limit heuristic.
	int LIMIT = 220 * M;

	// RTX 3090's CUDA core count.
	int nodes = 10496;
	// Optimal block size for this kernel for small N.
	int block_size = 128;
	int blocks = nodes / block_size;

	// Print a summary of the current search.
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	std::cout << "N: " << N << std::endl;
	std::cout << "Device: " << prop.name << std::endl;
	std::cout << "Blocks: " << blocks << std::endl;
	std::cout << "Threads per Block: " << block_size << std::endl;
	std::cout << "Total Threads: " << nodes << std::endl << std::endl;

	// Allocate memory.
	int *counts;
	color *edges;
	curandState_t *rng;
	// Memory that needs to be read by the host.
	cudaMallocManaged(&counts, nodes * sizeof(int));
	cudaMallocManaged(&edges, nodes * M * sizeof(color));
	// Memory that only the device uses.
	cudaMalloc(&rng, nodes * sizeof(curandState_t));

	// Setup GPU timers.
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// Randomize the starting graphs.
	cudaEventRecord(start);
	init <<<blocks, block_size >>> (N, counts, edges, rng, time(NULL));
	cudaEventRecord(stop);
	cudaDeviceSynchronize();

	float t;
	cudaEventElapsedTime(&t, start, stop);
	float triangle_speed = (float)nodes * (M * (N - 2) / 2) / (t * 1e-3);
	std::cout << std::scientific << std::setprecision(1);
	std::cout << "Setup :: ";
	std::cout << "Ts:" << triangle_speed << std::endl;
	std::cout << "----" << std::endl << std::endl;

	// Search until all monochromatic triangles are eliminated.
	int minima = INT_MAX;
	while (minima > 0) {
	// Saturate the GPU with simulated annealing search threads.
	cudaEventRecord(start);
	search <<<blocks, block_size>>> (N, LIMIT, counts, edges, rng);
	cudaEventRecord(stop);
	cudaDeviceSynchronize();

	// Calculate the execution time and speed.
	cudaEventElapsedTime(&t, start, stop);
	triangle_speed = (float)nodes * LIMIT * N / (t * 1e-3);

	// Find the best improvement in the batch.
	int improvement = -1;
	for (int b = 0; b < nodes; b++) {
	if (counts[b] < minima) {
	improvement = b;
	minima = counts[b];
	}
	}

	// Overwrite the last monitor.
	std::cout << " \r";

	// Report improvements after each batch.
	if (improvement >= 0) {
	for (int i = 1; i < N; i++) {
	for (int j = 0; j < i; j++)
	std::cout << (int)edges[improvement * M + TRI(j, i)];
	if (i < N - 1) std::cout << ",";
	}
	std::cout << std::endl << std::endl;
	std::cout << minima << " :: ";
	}

	// Monitor the limit and processing speed.
	std::cout << "L:" << (float)LIMIT << ", ";
	std::cout << "T/s:" << triangle_speed << " ";

	// Start a new monitor after a report.
	if (improvement >= 0) {
	std::cout << std::endl << "----" << std::endl << std::endl;
	}
	std::cout << std::flush;

	// Slowly search deeper into the solution space.
	LIMIT += (LIMIT * 0.05 + 1);
	}

	return 0;
	}