Snektron/fact.cu

## fact.cu
// Finds factors of 1 + 11! + (11!)! using CUDA
// Primes to check whether they are a factor of above numbers
// are to be supplied one-per-line to the program.
// See https://cr.yp.to/primegen.html for generating primes.
// Compile with `nvcc -o fact fact.cu`.
// Run with `./fact ./primes.txt`

#include <vector>
#include <fstream>
#include <iostream>
#include <chrono>
#include <algorithm>
#include <cstdlib>

#define cuda_assert(ans) \
    do { \
        cuda_assert_impl((ans), __PRETTY_FUNCTION__, __LINE__); \
    } while(0)

void cuda_assert_impl(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        std::cerr << "cuda_assert failed at " << file << ":" << line << ": "
            << cudaGetErrorString(code) << std::endl;

        exit(EXIT_FAILURE);
    }
}

// https://stackoverflow.com/questions/12252826/modular-arithmetic-on-the-gpu
// fast truncation of double-precision to integers
#define CUMP_D2I_TRUNC (double)(3ll << 51)
// computes r = a + b subop c unsigned using extended precision
#define VADDx(r, a, b, c, subop) \
    asm volatile("vadd.u32.u32.u32." subop " %0, %1, %2, %3;" :  \
            "=r"(r) : "r"(a) , "r"(b), "r"(c));

// computes a * b mod m; invk = (double)(1<<30) / m
__device__ __forceinline__ unsigned mul_m(unsigned a, unsigned b, volatile unsigned m, volatile double invk) {
    unsigned hi = __umulhi(a*2, b*2);
    // 2 double instructions
    double rf = __uint2double_rn(hi) * invk + CUMP_D2I_TRUNC;
    unsigned r = (unsigned)__double2loint(rf);
    r = a * b - r * m;

    VADDx(r, r, m, r, "min");
    return r;
}

__global__ void trail_division_kernel(unsigned n, unsigned* primes, unsigned* result) {
    constexpr const unsigned p1 = 2 * 3 * 4 * 5 * 6 * 7 * 8 * 9 * 10 * 11 + 1;

    int id = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = id; i < n; i += stride) {
        const unsigned p = primes[i];
        unsigned prod = 1;
        const double invk = (double)(1 << 30) / p;

        for (unsigned j = 2; j < p1; ++j) {
            prod = mul_m(prod, j, p, invk);
        }

        if ((prod + p1) % p == 0) {
            atomicExch(result, p);
        }
    }
}

void trail_division(const std::vector<unsigned>& primes) {
    auto ceil_div = [](auto a, auto b) {
        return (a + b - 1) / b;
    };

    unsigned* gpu_primes;
    cuda_assert(cudaMalloc(&gpu_primes, primes.size() * sizeof(unsigned)));
    cuda_assert(cudaMemcpy(gpu_primes, primes.data(), primes.size() * sizeof(unsigned), cudaMemcpyHostToDevice));
    cuda_assert(cudaDeviceSynchronize());

    unsigned* gpu_result;
    cuda_assert(cudaMalloc(&gpu_result, sizeof(unsigned)));
    cuda_assert(cudaMemset(gpu_result, 0, sizeof(unsigned)));
    cuda_assert(cudaDeviceSynchronize());

    int sms;
    int dev_id;
    cuda_assert(cudaGetDevice(&dev_id));
    cuda_assert(cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev_id));

    const unsigned block_dim = 128;
    const unsigned segment_size = block_dim * 32;
    const unsigned grid_dim = ceil_div(segment_size, sms * block_dim);

    const auto start = std::chrono::high_resolution_clock::now();

    for (size_t i = 0; i < primes.size(); i += segment_size) {
        unsigned n = std::min<unsigned>(segment_size, static_cast<unsigned>(primes.size()) - i);
        trail_division_kernel<<<grid_dim * sms, block_dim>>>(
            n,
            gpu_primes + i,
            gpu_result
        );

        unsigned result;
        cuda_assert(cudaMemcpy(&result, gpu_result, sizeof(unsigned), cudaMemcpyDeviceToHost));
        cuda_assert(cudaDeviceSynchronize());

        if (result != 0) {
            std::cout << "found factor: " << result << std::endl;
            break;
        }

        const auto now = std::chrono::high_resolution_clock::now();
        const auto elapsed = std::chrono::duration<double>(now - start).count();
        std::cout << primes[i] << "-" << primes[i + n - 1] << "... (" << elapsed << "s, " << ((i + n) / elapsed) << " checks/s)" << std::endl;
    }

    cuda_assert(cudaFree(gpu_result));
    cuda_assert(cudaFree(gpu_primes));
}

int main(int argc, char* argv[]) {
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <primes.txt>" << std::endl;
        return EXIT_FAILURE;
    }

    auto primes_in = std::ifstream(argv[1]);
    if (!primes_in) {
        std::cerr << "Error: failed to open '" << argv[1] << "'" << std::endl;
        return EXIT_FAILURE;
    }

    std::vector<unsigned> primes;
    while (true) {
        unsigned v;
        primes_in >> v;
        if (primes_in.eof()) {
            break;
        }

        primes.push_back(v);
    }

    trail_division(primes);

    return EXIT_SUCCESS;
}
	// Finds factors of 1 + 11! + (11!)! using CUDA
	// Primes to check whether they are a factor of above numbers
	// are to be supplied one-per-line to the program.
	// See https://cr.yp.to/primegen.html for generating primes.
	// Compile with `nvcc -o fact fact.cu`.
	// Run with `./fact ./primes.txt`

	#include <vector>
	#include <fstream>
	#include <iostream>
	#include <chrono>
	#include <algorithm>
	#include <cstdlib>

	#define cuda_assert(ans) \
	do { \
	cuda_assert_impl((ans), __PRETTY_FUNCTION__, __LINE__); \
	} while(0)

	void cuda_assert_impl(cudaError_t code, const char *file, int line) {
	if (code != cudaSuccess) {
	std::cerr << "cuda_assert failed at " << file << ":" << line << ": "
	<< cudaGetErrorString(code) << std::endl;

	exit(EXIT_FAILURE);
	}
	}

	// https://stackoverflow.com/questions/12252826/modular-arithmetic-on-the-gpu
	// fast truncation of double-precision to integers
	#define CUMP_D2I_TRUNC (double)(3ll << 51)
	// computes r = a + b subop c unsigned using extended precision
	#define VADDx(r, a, b, c, subop) \
	asm volatile("vadd.u32.u32.u32." subop " %0, %1, %2, %3;" : \
	"=r"(r) : "r"(a) , "r"(b), "r"(c));

	// computes a * b mod m; invk = (double)(1<<30) / m
	__device__ __forceinline__ unsigned mul_m(unsigned a, unsigned b, volatile unsigned m, volatile double invk) {
	unsigned hi = __umulhi(a2, b2);
	// 2 double instructions
	double rf = __uint2double_rn(hi) * invk + CUMP_D2I_TRUNC;
	unsigned r = (unsigned)__double2loint(rf);
	r = a * b - r * m;

	VADDx(r, r, m, r, "min");
	return r;
	}

	__global__ void trail_division_kernel(unsigned n, unsigned* primes, unsigned* result) {
	constexpr const unsigned p1 = 2 * 3 * 4 * 5 * 6 * 7 * 8 * 9 * 10 * 11 + 1;

	int id = blockIdx.x * blockDim.x + threadIdx.x;
	int stride = blockDim.x * gridDim.x;

	for (int i = id; i < n; i += stride) {
	const unsigned p = primes[i];
	unsigned prod = 1;
	const double invk = (double)(1 << 30) / p;

	for (unsigned j = 2; j < p1; ++j) {
	prod = mul_m(prod, j, p, invk);
	}

	if ((prod + p1) % p == 0) {
	atomicExch(result, p);
	}
	}
	}

	void trail_division(const std::vector<unsigned>& primes) {
	auto ceil_div = [](auto a, auto b) {
	return (a + b - 1) / b;
	};

	unsigned* gpu_primes;
	cuda_assert(cudaMalloc(&gpu_primes, primes.size() * sizeof(unsigned)));
	cuda_assert(cudaMemcpy(gpu_primes, primes.data(), primes.size() * sizeof(unsigned), cudaMemcpyHostToDevice));
	cuda_assert(cudaDeviceSynchronize());

	unsigned* gpu_result;
	cuda_assert(cudaMalloc(&gpu_result, sizeof(unsigned)));
	cuda_assert(cudaMemset(gpu_result, 0, sizeof(unsigned)));
	cuda_assert(cudaDeviceSynchronize());

	int sms;
	int dev_id;
	cuda_assert(cudaGetDevice(&dev_id));
	cuda_assert(cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev_id));

	const unsigned block_dim = 128;
	const unsigned segment_size = block_dim * 32;
	const unsigned grid_dim = ceil_div(segment_size, sms * block_dim);

	const auto start = std::chrono::high_resolution_clock::now();

	for (size_t i = 0; i < primes.size(); i += segment_size) {
	unsigned n = std::min<unsigned>(segment_size, static_cast<unsigned>(primes.size()) - i);
	trail_division_kernel<<<grid_dim * sms, block_dim>>>(
	n,
	gpu_primes + i,
	gpu_result
	);

	unsigned result;
	cuda_assert(cudaMemcpy(&result, gpu_result, sizeof(unsigned), cudaMemcpyDeviceToHost));
	cuda_assert(cudaDeviceSynchronize());

	if (result != 0) {
	std::cout << "found factor: " << result << std::endl;
	break;
	}

	const auto now = std::chrono::high_resolution_clock::now();
	const auto elapsed = std::chrono::duration<double>(now - start).count();
	std::cout << primes[i] << "-" << primes[i + n - 1] << "... (" << elapsed << "s, " << ((i + n) / elapsed) << " checks/s)" << std::endl;
	}

	cuda_assert(cudaFree(gpu_result));
	cuda_assert(cudaFree(gpu_primes));
	}

	int main(int argc, char* argv[]) {
	if (argc != 2) {
	std::cerr << "Usage: " << argv[0] << " <primes.txt>" << std::endl;
	return EXIT_FAILURE;
	}

	auto primes_in = std::ifstream(argv[1]);
	if (!primes_in) {
	std::cerr << "Error: failed to open '" << argv[1] << "'" << std::endl;
	return EXIT_FAILURE;
	}

	std::vector<unsigned> primes;
	while (true) {
	unsigned v;
	primes_in >> v;
	if (primes_in.eof()) {
	break;
	}

	primes.push_back(v);
	}

	trail_division(primes);

	return EXIT_SUCCESS;
	}