Skip to content

Instantly share code, notes, and snippets.

@Snektron
Created January 12, 2020 19:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Snektron/40aa8be62db499148c588dee02a45e17 to your computer and use it in GitHub Desktop.
Save Snektron/40aa8be62db499148c588dee02a45e17 to your computer and use it in GitHub Desktop.
Find factors of 1 + 11! + (11!)!
// Finds factors of 1 + 11! + (11!)! using CUDA
// Primes to check whether they are a factor of above numbers
// are to be supplied one-per-line to the program.
// See https://cr.yp.to/primegen.html for generating primes.
// Compile with `nvcc -o fact fact.cu`.
// Run with `./fact ./primes.txt`
#include <vector>
#include <fstream>
#include <iostream>
#include <chrono>
#include <algorithm>
#include <cstdlib>
#define cuda_assert(ans) \
do { \
cuda_assert_impl((ans), __PRETTY_FUNCTION__, __LINE__); \
} while(0)
void cuda_assert_impl(cudaError_t code, const char *file, int line) {
if (code != cudaSuccess) {
std::cerr << "cuda_assert failed at " << file << ":" << line << ": "
<< cudaGetErrorString(code) << std::endl;
exit(EXIT_FAILURE);
}
}
// https://stackoverflow.com/questions/12252826/modular-arithmetic-on-the-gpu
// fast truncation of double-precision to integers
#define CUMP_D2I_TRUNC (double)(3ll << 51)
// computes r = a + b subop c unsigned using extended precision
#define VADDx(r, a, b, c, subop) \
asm volatile("vadd.u32.u32.u32." subop " %0, %1, %2, %3;" : \
"=r"(r) : "r"(a) , "r"(b), "r"(c));
// computes a * b mod m; invk = (double)(1<<30) / m
__device__ __forceinline__ unsigned mul_m(unsigned a, unsigned b, volatile unsigned m, volatile double invk) {
unsigned hi = __umulhi(a*2, b*2);
// 2 double instructions
double rf = __uint2double_rn(hi) * invk + CUMP_D2I_TRUNC;
unsigned r = (unsigned)__double2loint(rf);
r = a * b - r * m;
VADDx(r, r, m, r, "min");
return r;
}
__global__ void trail_division_kernel(unsigned n, unsigned* primes, unsigned* result) {
constexpr const unsigned p1 = 2 * 3 * 4 * 5 * 6 * 7 * 8 * 9 * 10 * 11 + 1;
int id = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = id; i < n; i += stride) {
const unsigned p = primes[i];
unsigned prod = 1;
const double invk = (double)(1 << 30) / p;
for (unsigned j = 2; j < p1; ++j) {
prod = mul_m(prod, j, p, invk);
}
if ((prod + p1) % p == 0) {
atomicExch(result, p);
}
}
}
void trail_division(const std::vector<unsigned>& primes) {
auto ceil_div = [](auto a, auto b) {
return (a + b - 1) / b;
};
unsigned* gpu_primes;
cuda_assert(cudaMalloc(&gpu_primes, primes.size() * sizeof(unsigned)));
cuda_assert(cudaMemcpy(gpu_primes, primes.data(), primes.size() * sizeof(unsigned), cudaMemcpyHostToDevice));
cuda_assert(cudaDeviceSynchronize());
unsigned* gpu_result;
cuda_assert(cudaMalloc(&gpu_result, sizeof(unsigned)));
cuda_assert(cudaMemset(gpu_result, 0, sizeof(unsigned)));
cuda_assert(cudaDeviceSynchronize());
int sms;
int dev_id;
cuda_assert(cudaGetDevice(&dev_id));
cuda_assert(cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev_id));
const unsigned block_dim = 128;
const unsigned segment_size = block_dim * 32;
const unsigned grid_dim = ceil_div(segment_size, sms * block_dim);
const auto start = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < primes.size(); i += segment_size) {
unsigned n = std::min<unsigned>(segment_size, static_cast<unsigned>(primes.size()) - i);
trail_division_kernel<<<grid_dim * sms, block_dim>>>(
n,
gpu_primes + i,
gpu_result
);
unsigned result;
cuda_assert(cudaMemcpy(&result, gpu_result, sizeof(unsigned), cudaMemcpyDeviceToHost));
cuda_assert(cudaDeviceSynchronize());
if (result != 0) {
std::cout << "found factor: " << result << std::endl;
break;
}
const auto now = std::chrono::high_resolution_clock::now();
const auto elapsed = std::chrono::duration<double>(now - start).count();
std::cout << primes[i] << "-" << primes[i + n - 1] << "... (" << elapsed << "s, " << ((i + n) / elapsed) << " checks/s)" << std::endl;
}
cuda_assert(cudaFree(gpu_result));
cuda_assert(cudaFree(gpu_primes));
}
int main(int argc, char* argv[]) {
if (argc != 2) {
std::cerr << "Usage: " << argv[0] << " <primes.txt>" << std::endl;
return EXIT_FAILURE;
}
auto primes_in = std::ifstream(argv[1]);
if (!primes_in) {
std::cerr << "Error: failed to open '" << argv[1] << "'" << std::endl;
return EXIT_FAILURE;
}
std::vector<unsigned> primes;
while (true) {
unsigned v;
primes_in >> v;
if (primes_in.eof()) {
break;
}
primes.push_back(v);
}
trail_division(primes);
return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment