Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Cousin Primes generator, multi-threaded, using SSoZ (Segmented Sieve of Zakiya), written in D
/* This D source file is a multiple threaded implementation to perform an
* extremely fast Segmented Sieve of Zakiya (SSoZ) to find Cousin Primes <= N.
*
* Inputs are single values N, or ranges N1 and N2, of 64-bits, 0 -- 2^64 - 1.
* Output is the number of cousin primes <= N, or in range N1 to N2; the last
* cousin prime value for the range; and the total time of execution.
*
* This code was developed on a System76 laptop with an Intel I7 6700HQ cpu,
* 2.6-3.5 GHz clock, with 8 threads, and 16GB of memory. Parameter tuning
* would be needed to optimize for other hadware systems (ARM, PowerPC, etc).
*
* To compile with ldc2: $ ldc2 --release -O3 cousinprimes_ssoz.d
* To reduce binary size do: $ strip cousinprimes_ssoz
* Then run executable: $ ./cousinprimes_ssoz <cr>, and enter range values.
* Or alternatively: $ echo <range1> <range2> | ./cousinprimes_ssoz
* Range values can be provided in either order (larger or smaller first).
*
* This D source file, and updates, will be available here:
* https://gist.github.com/jzakiya/147747d391b5b0432c7967dd17dae124
*
* Mathematical and technical basis for implementation are explained here:
* https://www.academia.edu/37952623/The_Use_of_Prime_Generators_to_Implement_Fast_
* Twin_Primes_Sieve_of_Zakiya_SoZ_Applications_to_Number_Theory_and_Implications_
* for_the_Riemann_Hypotheses
* https://www.academia.edu/7583194/The_Segmented_Sieve_of_Zakiya_SSoZ_
* https://www.academia.edu/19786419/PRIMES-UTILS_HANDBOOK
* https://www.academia.edu/81206391/Twin_Primes_Segmented_Sieve_of_Zakiya_SSoZ_Explained
*
* This code is provided free and subject to copyright and terms of the
* GNU General Public License Version 3, GPLv3, or greater.
* License copy/terms are here: http://www.gnu.org/licenses/
*
* Copyright (c) 2017-2022 Jabari Zakiya -- jzakiya at gmail dot com
* Version Date: 2022/10/07
*/
module cousinprimes_ssoz;
import std.algorithm : min, max, swap, sort;
import std.datetime.stopwatch : StopWatch;
import std.math : sqrt;
import std.parallelism : parallel, totalCPUs;
import std.range : iota;
import std.stdio : readf, write, writeln, readln;
import std.numeric : gcd;
import std.array: appender;
import core.bitop: popcnt;
import core.atomic: atomicOp;
/// Compute modular inverse a^-1 of a to base b, e.g. a*(a^-1) mod b = 1.
int modinv(int a0, int b0) pure @nogc
{
int a = a0;
int b = b0;
int x0 = 0;
int result = 1;
if (b == 1) { return result; }
while (a > 1) {
immutable q = a / b;
a = a % b;
swap(a, b);
result -= q * x0;
swap(x0, result);
}
if (result < 0) { result += b0; }
return result;
}
// Global parameters
shared uint Ks = 0; // segment size for each seg restrack
shared ulong start_num; // lo number for range
shared ulong end_num; // hi number for range
shared ulong Kmax; // number of resgroups to end_num
shared ulong Kmin; // number of resgroups to start_num
shared ulong[] primes; // list of primes r1..sqrt(N)
shared uint[] cnts; // holds cousin primes count for each cp|thread
shared ulong[] lastcousins; // holds largest cousin prime for each cp|thread
shared uint modpg; // PG's modulus value
shared uint res_0; // PG's first residue value
shared ulong[] rescousins; // PG's list of cousinpair residues
shared ulong[] resinvrs; // PG's list of residues inverses
shared uint bn; // segment size factor for PG and input number7
/// Creates constant parameters for selected Prime Generator at runtime.
void genPGparameters(int prime)
{
writeln("using parameters for P", prime);
static immutable primes = [2, 3, 5, 7, 11, 13, 17, 19, 23];
uint modpn = 1;
uint res0 = 0; // compute Pn's modulus and res_0 value
foreach (prm; primes) { res0 = prm; if (prm > prime) break; modpn *= prm; }
rescousins = []; // save upper cousin pair residues here
resinvrs = new ulong[modpn + 2]; // save Pn's residues inverses here
uint pc = 5; // use P3's PGS to generate pcs
uint inc = 2; // init value in its seq: 2 4 2 4 2 4 ...
auto res = 0; // init residue value
auto midmodpn = modpn >> 1; // mid value of modpn
while (pc < midmodpn) { // find PG's 1st half residues
if (gcd(pc, modpn) == 1) { // if pc a residue
auto mc = modpn - pc; // create its modular complement
resinvrs[pc] = modinv(pc,modpn); // save pc and mc inverses
resinvrs[mc] = modinv(mc,modpn); // if in cousinprimes save both hi residues
if (res + 4 == pc) { rescousins ~= pc; rescousins ~= mc + 4; }
res = pc; // save current found residue
}
pc += inc; inc = inc ^ 0b110; // create next P3 sequence pc: 5 7 11 13 17 19 ...
}
rescousins ~= midmodpn + 2; rescousins.sort; // create pivot hi_cp
resinvrs[modpn + 1] = 1; resinvrs[modpn - 1] = modpn - 1; // last 2 residues are self inverses
modpg = modpn; res_0 = res0; // save global variables
}
/// Select at runtime best PG and segment size factor to use for input range.
/// These are good estimates derived from PG data profiling. Can be improved.
void setSieveParameters(ulong start_range, ulong end_range) {
ulong range = end_range - start_range;
int pg = 3;
if (end_range < 49) {
bn = 1; pg = 3;
} else if (range < 77_000_000) {
bn = 16; pg = 5;
} else if (range < 1_100_000_000) {
bn = 32; pg = 7;
} else if (range < 35_500_000_000uL) {
bn = 64; pg = 11;
} else if (range < 14_000_000_000_000uL) {
pg = 13;
if (range > 7_000_000_000_000uL) { bn = 384; }
else if (range > 2_500_000_000_000uL) { bn = 320; }
else if (range > 250_000_000_000uL) { bn = 196; }
else { bn = 128; }
}
else {
bn = 384; pg = 17;
}
genPGparameters(pg);
auto pairscnt = rescousins.length; // number of cousin pairs|threads for selected PG
Kmin = (start_num-2) / modpg + 1; // number of resgroups to start_num
Kmax = (end_num - 2) / modpg + 1; // number of resgroups to end_num
auto krange = Kmax - Kmin + 1; // number of range resgroups, at least 1
auto n = (krange < 37_500_000_000_000uL) ? 4 : (krange < 950_000_000_000_000uL) ? 6 : 8;
auto b = bn * 1024 * n; // set seg size to optimize for selected PG
Ks = cast(uint)((krange < b) ? krange : b); // segments resgroups size
writeln("segment size = ",Ks," resgroups; seg array is [1 x ",((Ks-1) >> 6) + 1,"] 64-bits");
auto maxpairs = krange * pairscnt; // maximum number of cousin prime pcs
writeln("cousinprime candidates = ", maxpairs, "; resgroups = ", krange);
}
/// Compute the primes r1..sqrt(input_num), store in global 'primes' array.
/// Any algorithm (fast|small) is usable. Here the SoZ for P5 is used.
void sozPg(ulong val) {
uint md = 30; // P5's modulus value
ulong rscnt = 8; // P5's residue count
static immutable res = [7, 11, 13, 17, 19, 23, 29, 31];
static immutable posn = [0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,3,0,4,0,0,0,5,0,0,0,0,0,6,0,7];
ulong kmax = (val - 2) / md + 1; // number of resgroups upto input value
ubyte[] prms = new ubyte[](kmax); // byte array of prime candidates init '0'
ulong sqrtN = cast(ulong) sqrt(cast(double) val); // integer sqrt of val
uint modk = 0; int r = -1; uint k = 0; // init residue parameters
// mark the multiples of the primes r1..sqrtN in 'prms'
while (true) {
++r; if (r == rscnt) {r = 0; modk += md; ++k;}
if ((prms[k] & (1 << r)) != 0) continue; // skip pc if not prime
uint prm_r = res[r]; // if prime save its residue value
ulong prime = modk + prm_r; // numerate the prime value
if (prime > sqrtN) break; // we're finished if it's > sqrtN
foreach (ri; res) { // mark prime's multiples in prms
uint prod = prm_r * ri - 2; // compute cross-product for prm_r|ri pair
ubyte bit_r = cast(ubyte) (1 << posn[prod % md]); // prod's residue bit mask
ulong kpm = (k * (prime + ri) + prod / md); // 1st resgroup for prime mult
while (kpm < kmax) { prms[kpm] |= bit_r; kpm += prime; }
}
}
// prms now contains the nonprime positions for the prime candidates r1..N
// extract only primes that are in inputs range into array 'primes'
primes = []; // create empty dynamic array for primes
foreach (kk, resgroup; prms) { // for each kth residue group
foreach (i, res_i; res) { // check for each ith residue in resgroup
if ((resgroup & (1 << i)) == 0) { // if bit location a prime
ulong prime = md * kk + res_i; // numerate its value, store if in range
// check if prime has multiple in range, if so keep it, if not don't
ulong n = start_num / prime; ulong rem = start_num % prime;
if (((res_0 <= prime) && (prime <= val)) && (prime * n <= end_num - prime || rem == 0)) { primes ~= prime; }
} } } }
/// Initialize 'nextp' array for given cousin pair residues for thread.
/// Compute 1st prime multiple resgroups in range for each prime r1..sqrt(N)
/// and store consecutively as lo_cp|hi_cp pairs for their restracks.
ulong[] nextp_init(ulong hi_r, ulong k_min) {
ulong[] nextp = new ulong[](primes.length * 2); // 1st mults array for tp
ulong r_hi = hi_r; // upper cousin pair value
ulong r_lo = hi_r - 4; // lower cousin pair value
foreach(size_t j, prime; primes) { // for each prime r1..sqrt(N)
auto k = (prime - 2) / modpg; // find the resgroup it's in
auto r = (prime - 2) % modpg + 2; // and its residue value
auto r_inv = resinvrs[r]; // and its residue inverse
auto rl = (r_lo * r_inv - 2) % modpg + 2; // compute the ri for r for lo_cp
auto rh = (r_hi * r_inv - 2) % modpg + 2; // compute the ri for r for hi_tp
auto kl = k * (prime + rl) + (r * rl - 2) / modpg;
auto kh = k * (prime + rh) + (r * rh - 2) / modpg;
if (kl < k_min) { // if 1st mult index < start_num's
kl = (k_min - kl) % prime; // how many indices short is it
if (kl > 0) kl = prime - kl; // adjust index value into range
} else { kl = kl - k_min; } // else here, addjust index if it was >
if (kh < k_min) { // if 1st mult index < start_num's
kh = (k_min - kh) % prime; // how many indices short is it
if (kh > 0) kh = prime - kh; // adjust index value into range
} else { kh = kh - k_min; } // else here, addjust index if it was >
nextp[j * 2] = kl; // lo_cp index val >= start of range
nextp[j * 2 | 1] = kh; // hi_cp index val >= start of range
}
return nextp;
}
/// Perform in a thread, the ssoz for a given cousinpair, for Kmax resgroups.
/// First create|init 'nextp' array of 1st prime mults for given cousin pair,
/// (stored consequtively in 'nextp') and init seg byte array for Ks resgroups.
/// For sieve, mark resgroup bits to '1' if either cousinpair restrack is nonprime,
/// for primes mults resgroups, and update 'nextp' restrack slices acccordingly.
/// Find last cousin prime|sum for range, store in their arrays for this cousinpair.
/// Can optionally compile to print mid cousinprime values generated by cousinpair.
void cousinsSieve(uint indx) {
uint s = 6; // shift value for 64 bits
uint bmask = (1 << s) - 1; // bitmask val for 64 bits
auto sum = 0; // cousins count for cousin pair for segment
ulong Ki = Kmin - 1; // first resgroup val for range for slice
ulong K_max = Kmax; // last resgroup val for range for slice
uint Kn = Ks; // resgroup seg size for each seg slice
ulong hi_cp = 0; // value of largest cp hi prime in seg
ulong r_hi = rescousins[indx]; // hi cp residue for this thread
ulong[] seg = new ulong[](((Ks-1) >> s) + 1); // seg array for Ks regroups
if (r_hi - 4 < (start_num - 2) % modpg + 2) ++Ki; // ensure lo cp in range
if (r_hi > (end_num - 2) % modpg + 2) --K_max; // ensure hi cp in range
ulong[] nextp = nextp_init(r_hi, Ki); // 1st prime mults for cousin pair restracks
while (Ki < K_max) { // for Kn resgroup size slices upto Kmax
Kn = min(Ks, K_max - Ki); // set segment slice resgroup length
foreach (j, prime; primes) { // for each prime index r1..sqrt(N)
// for lower pair residue track
ulong k = nextp[j * 2]; // starting from this lower resgroup in seg
while (k < Kn) { // mark primenth resgroup bits prime mults
seg[k >> s] |= 1uL << (k & bmask);
k += prime; } // set next prime multiple resgroup
nextp[j * 2] = k - Kn; // save 1st resgroup in next eligible seg
// for upper pair residue track
k = nextp[j * 2 | 1]; // starting from this upper resgroup in seg
while (k < Kn) { // mark primenth resgroup prime mults
seg[k >> s] |= 1uL << (k & bmask);
k += prime; } // set next prime multiple resgroup
nextp[j * 2 | 1] = k - Kn; // save 1st resgroup in next eligible seg
} // set as nonprime unused bits in last seg[n]
// so fast, do for every seg[i]
seg[(Kn-1) >> s] = seg[(Kn-1) >> s] | (~1uL << ((Kn-1) & bmask));
auto cnt = 0; // count the cousinprimes in the segment
foreach (i; 0..((Kn-1) >> s) + 1) { cnt += popcnt(~seg[i]); }
if (cnt > 0) { // if segment has cousin primes
sum += cnt; // add the segment count to total count
uint upk = Kn - 1; // from end of seg count backwards to largest cp
while ((seg[upk >> s] & (1uL << (upk & bmask))) != 0) --upk;
hi_cp = Ki + upk; // numerate its full resgroup value
}
Ki += Ks; // set 1st resgroup val of next seg slice
if (Ki < K_max) {foreach (b; 0..((Kn-1) >> s) + 1) seg[b] = 0;} // set seg all primes
}
// numerate largest cousin prime in seg
hi_cp = ((r_hi > end_num) || (sum == 0)) ? 0 : hi_cp * modpg + r_hi;
lastcousins[indx] = hi_cp; // store final seg cp value
cnts[indx] = sum; // sum for cousin pair
}
/// Main routine to run cousinprimes sieve, input ranges within 0..2^64 - 1.
void cousinPrimesSsoz() {
ulong[] x;
foreach (_; 0 .. 2) { ulong a; readf!" %d"(a); x ~= a; }
end_num = max(x[0], 3);
start_num = max(x[1], 3);
if (start_num > end_num) swap(start_num, end_num);
start_num = start_num | 1; // if start_num even add 1
end_num = (end_num - 1) | 1; // if end_num even subtract 1
if (end_num - start_num < 4) { start_num = 7; end_num = 7; }
writeln("threads = ", totalCPUs);
auto stopWatchSetup = StopWatch();
stopWatchSetup.start(); // start timing sieve setup execution
setSieveParameters(start_num, end_num);
auto pairscnt = rescousins.length;
auto krange = Kmax - Kmin + 1;
// create sieve primes <= sqrt(end_num), only use those whose multiples within inputs range
if (end_num < 49) primes ~= 5; // generate sieving primes
else sozPg(cast(ulong) sqrt(cast(double) end_num)); // <= sqrt(end_num)
writeln("each of ", pairscnt, " threads has nextp[", 2, " x ", primes.length, "] array");
stopWatchSetup.stop(); // sieve setup time
writeln("setup time = ", stopWatchSetup.peek());
writeln("perform cousinprimes ssoz sieve");
auto stopWatchExecution = StopWatch(); // start timing ssoz sieve execution
stopWatchExecution.start();
cnts = new uint[](pairscnt); // count of cps found per thread
lastcousins = new ulong[](pairscnt); // largest hi_cp for each thread
shared uint threadscnt = 0; // count of finished threads
foreach (indx; parallel(iota(0, pairscnt))) { // do in parallel for each cp
cousinsSieve(cast(uint) indx); // sieve selected cousinpair restracks
write("\r", threadscnt.atomicOp!"+="(1), " of ", pairscnt, " cousinpairs done");
}
write("\r", pairscnt, " of ", pairscnt, " cousinpairs done");
ulong cousinscnt = 0uL; // count of cousin primes in range
ulong last_cousin = 0uL; // largest hi_cp cousin in range
if (end_num < 49) { // check for cousins in low ranges
foreach (c_hi; [11, 17, 23, 41, 47]) {
if (start_num <= c_hi - 4 && c_hi <= end_num) { last_cousin = c_hi; cousinscnt += 1; }
} }
else { // check for cousins from sieve
foreach(i; 0..pairscnt) { // find largest cousinprime|count in range
cousinscnt += cnts[i];
if (last_cousin < lastcousins[i]) last_cousin = lastcousins[i];
} }
// account for 1st cousin prime, defined as (3, 7)
if (start_num == 3 && end_num > 6) { cousinscnt += 1; if (end_num < 11) last_cousin = 7;}
auto Kn = krange % Ks; // set num of resgroups in last slice
if (Kn == 0) Kn = Ks; // if mult of segsize set to seg size
stopWatchExecution.stop(); // sieve execution time
writeln("\nsieve time = ", stopWatchExecution.peek());
writeln("total time = ", stopWatchSetup.peek() + stopWatchExecution.peek());
writeln("last segment = ", Kn, " resgroups; segment slices = ", (krange-1) / Ks + 1);
writeln("total cousins = ", cousinscnt, "; last cousin = ", last_cousin, "|-4");
}
void main()
{ cousinPrimesSsoz(); }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment