Created
January 12, 2020 19:01
-
-
Save Snektron/40aa8be62db499148c588dee02a45e17 to your computer and use it in GitHub Desktop.
Find factors of 1 + 11! + (11!)!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Finds factors of 1 + 11! + (11!)! using CUDA | |
// Primes to check whether they are a factor of above numbers | |
// are to be supplied one-per-line to the program. | |
// See https://cr.yp.to/primegen.html for generating primes. | |
// Compile with `nvcc -o fact fact.cu`. | |
// Run with `./fact ./primes.txt` | |
#include <vector> | |
#include <fstream> | |
#include <iostream> | |
#include <chrono> | |
#include <algorithm> | |
#include <cstdlib> | |
#define cuda_assert(ans) \ | |
do { \ | |
cuda_assert_impl((ans), __PRETTY_FUNCTION__, __LINE__); \ | |
} while(0) | |
void cuda_assert_impl(cudaError_t code, const char *file, int line) { | |
if (code != cudaSuccess) { | |
std::cerr << "cuda_assert failed at " << file << ":" << line << ": " | |
<< cudaGetErrorString(code) << std::endl; | |
exit(EXIT_FAILURE); | |
} | |
} | |
// https://stackoverflow.com/questions/12252826/modular-arithmetic-on-the-gpu | |
// fast truncation of double-precision to integers | |
#define CUMP_D2I_TRUNC (double)(3ll << 51) | |
// computes r = a + b subop c unsigned using extended precision | |
#define VADDx(r, a, b, c, subop) \ | |
asm volatile("vadd.u32.u32.u32." subop " %0, %1, %2, %3;" : \ | |
"=r"(r) : "r"(a) , "r"(b), "r"(c)); | |
// computes a * b mod m; invk = (double)(1<<30) / m | |
__device__ __forceinline__ unsigned mul_m(unsigned a, unsigned b, volatile unsigned m, volatile double invk) { | |
unsigned hi = __umulhi(a*2, b*2); | |
// 2 double instructions | |
double rf = __uint2double_rn(hi) * invk + CUMP_D2I_TRUNC; | |
unsigned r = (unsigned)__double2loint(rf); | |
r = a * b - r * m; | |
VADDx(r, r, m, r, "min"); | |
return r; | |
} | |
__global__ void trail_division_kernel(unsigned n, unsigned* primes, unsigned* result) { | |
constexpr const unsigned p1 = 2 * 3 * 4 * 5 * 6 * 7 * 8 * 9 * 10 * 11 + 1; | |
int id = blockIdx.x * blockDim.x + threadIdx.x; | |
int stride = blockDim.x * gridDim.x; | |
for (int i = id; i < n; i += stride) { | |
const unsigned p = primes[i]; | |
unsigned prod = 1; | |
const double invk = (double)(1 << 30) / p; | |
for (unsigned j = 2; j < p1; ++j) { | |
prod = mul_m(prod, j, p, invk); | |
} | |
if ((prod + p1) % p == 0) { | |
atomicExch(result, p); | |
} | |
} | |
} | |
void trail_division(const std::vector<unsigned>& primes) { | |
auto ceil_div = [](auto a, auto b) { | |
return (a + b - 1) / b; | |
}; | |
unsigned* gpu_primes; | |
cuda_assert(cudaMalloc(&gpu_primes, primes.size() * sizeof(unsigned))); | |
cuda_assert(cudaMemcpy(gpu_primes, primes.data(), primes.size() * sizeof(unsigned), cudaMemcpyHostToDevice)); | |
cuda_assert(cudaDeviceSynchronize()); | |
unsigned* gpu_result; | |
cuda_assert(cudaMalloc(&gpu_result, sizeof(unsigned))); | |
cuda_assert(cudaMemset(gpu_result, 0, sizeof(unsigned))); | |
cuda_assert(cudaDeviceSynchronize()); | |
int sms; | |
int dev_id; | |
cuda_assert(cudaGetDevice(&dev_id)); | |
cuda_assert(cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev_id)); | |
const unsigned block_dim = 128; | |
const unsigned segment_size = block_dim * 32; | |
const unsigned grid_dim = ceil_div(segment_size, sms * block_dim); | |
const auto start = std::chrono::high_resolution_clock::now(); | |
for (size_t i = 0; i < primes.size(); i += segment_size) { | |
unsigned n = std::min<unsigned>(segment_size, static_cast<unsigned>(primes.size()) - i); | |
trail_division_kernel<<<grid_dim * sms, block_dim>>>( | |
n, | |
gpu_primes + i, | |
gpu_result | |
); | |
unsigned result; | |
cuda_assert(cudaMemcpy(&result, gpu_result, sizeof(unsigned), cudaMemcpyDeviceToHost)); | |
cuda_assert(cudaDeviceSynchronize()); | |
if (result != 0) { | |
std::cout << "found factor: " << result << std::endl; | |
break; | |
} | |
const auto now = std::chrono::high_resolution_clock::now(); | |
const auto elapsed = std::chrono::duration<double>(now - start).count(); | |
std::cout << primes[i] << "-" << primes[i + n - 1] << "... (" << elapsed << "s, " << ((i + n) / elapsed) << " checks/s)" << std::endl; | |
} | |
cuda_assert(cudaFree(gpu_result)); | |
cuda_assert(cudaFree(gpu_primes)); | |
} | |
int main(int argc, char* argv[]) { | |
if (argc != 2) { | |
std::cerr << "Usage: " << argv[0] << " <primes.txt>" << std::endl; | |
return EXIT_FAILURE; | |
} | |
auto primes_in = std::ifstream(argv[1]); | |
if (!primes_in) { | |
std::cerr << "Error: failed to open '" << argv[1] << "'" << std::endl; | |
return EXIT_FAILURE; | |
} | |
std::vector<unsigned> primes; | |
while (true) { | |
unsigned v; | |
primes_in >> v; | |
if (primes_in.eof()) { | |
break; | |
} | |
primes.push_back(v); | |
} | |
trail_division(primes); | |
return EXIT_SUCCESS; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment