Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Twinprimes generator, multi-threaded, using SSoZ (Segmented Sieve of Zakiya), written in D
/* This D source file is a multiple threaded implementation to perform an
* extremely fast Segmented Sieve of Zakiya (SSoZ) to find Twin Primes <= N.
*
* Inputs are single values N, or ranges N1 and N2, of 64-bits, 0 -- 2^64 - 1.
* Output is the number of twin primes <= N, or in range N1 to N2; the last
* twin prime value for the range; and the total time of execution.
*
* This code was developed on a System76 laptop with an Intel I7 6700HQ cpu,
* 2.6-3.5 GHz clock, with 8 threads, and 16GB of memory. Parameter tuning
* would be needed to optimize for other hadware systems (ARM, PowerPC, etc).
*
* To compile with ldc2: $ ldc2 --release -O3 twinprimes_ssozgist.d
* To reduce binary size do: $ strip cousinprimes_ssoz
* Then run executable: $ ./twinprimes_ssoz <cr>, and enter range values.
* Or alternatively: $ echo <range1> <range2> | ./twinprimes_ssoz
* Range values can be provided in either order (larger or smaller first).
*
* This D source file, and updates, will be available here:
* https://gist.github.com/jzakiya/ae93bfa03dbc8b25ccc7f97ff8ad0f61
*
* Mathematical and technical basis for implementation are explained here:
* https://www.academia.edu/37952623/The_Use_of_Prime_Generators_to_Implement_Fast_
* Twin_Primes_Sieve_of_Zakiya_SoZ_Applications_to_Number_Theory_and_Implications_
* for_the_Riemann_Hypotheses
* https://www.academia.edu/7583194/The_Segmented_Sieve_of_Zakiya_SSoZ_
* https://www.academia.edu/19786419/PRIMES-UTILS_HANDBOOK
* https://www.academia.edu/81206391/Twin_Primes_Segmented_Sieve_of_Zakiya_SSoZ_Explained
*
* This code is provided free and subject to copyright and terms of the
* GNU General Public License Version 3, GPLv3, or greater.
* License copy/terms are here: http://www.gnu.org/licenses/
*
* Copyright (c) 2017-2022 Jabari Zakiya -- jzakiya at gmail dot com
* Version Date: 2022/10/07
*/
module twinprimes_ssoz;
import std.algorithm : min, max, swap, sort;
import std.datetime.stopwatch : StopWatch;
import std.math : sqrt;
import std.parallelism : parallel, totalCPUs;
import std.range : iota;
import std.stdio : readf, write, writeln, readln;
import std.numeric : gcd;
import std.array: appender;
import core.bitop: popcnt;
import core.atomic: atomicOp;
/// Compute modular inverse a^-1 of a to base b, e.g. a*(a^-1) mod b = 1.
int modinv(int a0, int b0) pure @nogc
{
int a = a0;
int b = b0;
int x0 = 0;
int result = 1;
if (b == 1) { return result; }
while (a > 1) {
immutable q = a / b;
a = a % b;
swap(a, b);
result -= q * x0;
swap(x0, result);
}
if (result < 0) { result += b0; }
return result;
}
// Global parameters
shared uint Ks = 0; // segment size for each seg restrack
shared ulong start_num; // lo number for range
shared ulong end_num; // hi number for range
shared ulong Kmax; // number of resgroups to end_num
shared ulong Kmin; // number of resgroups to start_num
shared ulong[] primes; // list of primes r1..sqrt(N)
shared uint[] cnts; // holds twin primes count for each tp|thread
shared ulong[] lastwins; // holds largest twin prime for each tp|thread
shared uint modpg; // PG's modulus value
shared uint res_0; // PG's first residue value
shared ulong[] restwins; // PG's list of twinpair residues
shared ulong[] resinvrs; // PG's list of residues inverses
shared uint bn; // segment size factor for PG and input number
/// Creates constant parameters for selected Prime Generator at runtime.
void genPGparameters(int prime)
{
writeln("using parameters for P", prime);
static immutable primes = [2, 3, 5, 7, 11, 13, 17, 19, 23];
uint modpn = 1;
uint res0 = 0; // compute Pn's modulus and res_0 value
foreach (prm; primes) { res0 = prm; if (prm > prime) break; modpn *= prm; }
restwins = []; // save upper twin pair residues here
resinvrs = new ulong[modpn + 2]; // save Pn's residues inverses here
uint pc = 5; // use P3's PGS to generate pcs
uint inc = 2; // init value in its seq: 2 4 2 4 2 4 ...
auto res = 0; // init residue value
while (pc < (modpn >> 1)) { // find PG's 1st half residues
if (gcd(pc, modpn) == 1) { // if pc a residue
auto mc = modpn - pc; // create its modular complement
resinvrs[pc] = modinv(pc,modpn); // save pc and mc inverses
resinvrs[mc] = modinv(mc,modpn); // if in twinpair save both hi residues
if (res + 2 == pc) { restwins ~= pc; restwins ~= mc + 2; }
res = pc; // save current found residue
}
pc += inc; inc = inc ^ 0b110; // create next P3 sequence pc: 5 7 11 13 17 19 ...
}
restwins.sort; restwins ~= modpn + 1; // last residue is last hi_tp
resinvrs[modpn + 1] = 1; resinvrs[modpn - 1] = modpn - 1; // last 2 residues are self inverses
modpg = modpn; res_0 = res0; // save global variables
}
/// Select at runtime best PG and segment size factor to use for input range.
/// These are good estimates derived from PG data profiling. Can be improved.
void setSieveParameters(ulong start_range, ulong end_range) {
ulong range = end_range - start_range;
int pg = 3;
if (end_range < 49) {
bn = 1; pg = 3;
} else if (range < 77_000_000) {
bn = 16; pg = 5;
} else if (range < 1_100_000_000) {
bn = 32; pg = 7;
} else if (range < 35_500_000_000uL) {
bn = 64; pg = 11;
} else if (range < 14_000_000_000_000uL) {
pg = 13;
if (range > 7_000_000_000_000uL) { bn = 384; }
else if (range > 2_500_000_000_000uL) { bn = 320; }
else if (range > 250_000_000_000uL) { bn = 196; }
else { bn = 128; }
}
else {
bn = 384; pg = 17;
}
genPGparameters(pg);
auto pairscnt = restwins.length; // number of twin pairs|threads for selected PG
Kmin = (start_num-2) / modpg + 1; // number of resgroups to start_num
Kmax = (end_num - 2) / modpg + 1; // number of resgroups to end_num
auto krange = Kmax - Kmin + 1; // number of range resgroups, at least 1
auto n = (krange < 37_500_000_000_000uL) ? 4 : (krange < 950_000_000_000_000uL) ? 6 : 8;
auto b = bn * 1024 * n; // set seg size to optimize for selected PG
Ks = cast(uint)((krange < b) ? krange : b); // segments resgroups size
writeln("segment size = ",Ks," resgroups; seg array is [1 x ",((Ks-1) >> 6) + 1,"] 64-bits");
auto maxpairs = krange * pairscnt; // maximum number of twin prime pcs
writeln("twinprime candidates = ", maxpairs, "; resgroups = ", krange);
}
/// Compute the primes r1..sqrt(input_num), store in global 'primes' array.
/// Any algorithm (fast|small) is usable. Here the SoZ for P5 is used.
void sozPg(ulong val) {
uint md = 30; // P5's modulus value
ulong rscnt = 8; // P5's residue count
static immutable res = [7, 11, 13, 17, 19, 23, 29, 31];
static immutable posn = [0,0,0,0,0,0,0,0,0,1,0,2,0,0,0,3,0,4,0,0,0,5,0,0,0,0,0,6,0,7];
ulong kmax = (val - 2) / md + 1; // number of resgroups upto input value
ubyte[] prms = new ubyte[](kmax); // byte array of prime candidates init '0'
ulong sqrtN = cast(ulong) sqrt(cast(double) val); // integer sqrt of val
uint modk = 0; int r = -1; uint k = 0; // init residue parameters
// mark the multiples of the primes r1..sqrtN in 'prms'
while (true) {
++r; if (r == rscnt) {r = 0; modk += md; ++k;}
if ((prms[k] & (1 << r)) != 0) continue; // skip pc if not prime
uint prm_r = res[r]; // if prime save its residue value
ulong prime = modk + prm_r; // numerate the prime value
if (prime > sqrtN) break; // we're finished if it's > sqrtN
foreach (ri; res) { // mark prime's multiples in prms
uint prod = prm_r * ri - 2; // compute cross-product for prm_r|ri pair
ubyte bit_r = cast(ubyte) (1 << posn[prod % md]); // prod's residue bit mask
ulong kpm = (k * (prime + ri) + prod / md); // 1st resgroup for prime mult
while (kpm < kmax) { prms[kpm] |= bit_r; kpm += prime; }
}
}
// prms now contains the nonprime positions for the prime candidates r1..N
// extract only primes that are in inputs range into array 'primes'
primes = []; // create empty dynamic array for primes
foreach (kk, resgroup; prms) { // for each kth residue group
foreach (i, res_i; res) { // check for each ith residue in resgroup
if ((resgroup & (1 << i)) == 0) { // if bit location a prime
ulong prime = md * kk + res_i; // numerate its value, store if in range
// check if prime has multiple in range, if so keep it, if not don't
ulong n = start_num / prime; ulong rem = start_num % prime;
if (((res_0 <= prime) && (prime <= val)) && (prime * n <= end_num - prime || rem == 0)) { primes ~= prime; }
} } } }
/// Initialize 'nextp' array for given twin pair residues for thread.
/// Compute 1st prime multiple resgroups in range for each prime r1..sqrt(N)
/// and store consecutively as lo_tp|hi_tp pairs for their restracks.
ulong[] nextp_init(ulong hi_r, ulong k_min) {
ulong[] nextp = new ulong[](primes.length * 2); // 1st mults array for tp
ulong r_hi = hi_r; // upper twin pair value
ulong r_lo = hi_r - 2; // lower twin pair value
foreach(size_t j, prime; primes) { // for each prime r1..sqrt(N)
auto k = (prime - 2) / modpg; // find the resgroup it's in
auto r = (prime - 2) % modpg + 2; // and its residue value
auto r_inv = resinvrs[r]; // and its residue inverse
auto rl = (r_lo * r_inv - 2) % modpg + 2; // compute the ri for r for lo_cp
auto rh = (r_hi * r_inv - 2) % modpg + 2; // compute the ri for r for hi_tp
auto kl = k * (prime + rl) + (r * rl - 2) / modpg; // kl 1st mult resgroup
auto kh = k * (prime + rh) + (r * rh - 2) / modpg; // kh 1st mult resgroup
if (kl < k_min) { // if 1st mult index < start_num's
kl = (k_min - kl) % prime; // how many indices short is it
if (kl > 0) kl = prime - kl; // adjust index value into range
} else { kl = kl - k_min; } // else here, addjust index if it was >
if (kh < k_min) { // if 1st mult index < start_num's
kh = (k_min - kh) % prime; // how many indices short is it
if (kh > 0) kh = prime - kh; // adjust index value into range
} else { kh = kh - k_min; } // else here, addjust index if it was >
nextp[j * 2] = kl; // lo_cp index val >= start of range
nextp[j * 2 | 1] = kh; // hi_cp index val >= start of range
}
return nextp;
}
/// Perform in a thread, the ssoz for a given twinpair, for Kmax resgroups.
/// First create|init 'nextp' array of 1st prime mults for given twin pair,
/// (stored consequtively in 'nextp') and init seg byte array for Ks resgroups.
/// For sieve, mark resgroup bits to '1' if either twinpair restrack is nonprime,
/// for primes mults resgroups, and update 'nextp' restrack slices acccordingly.
/// Find last twin prime|sum for range, store in their arrays for this twinpair.
/// Can optionally compile to print mid twinprime values generated by twinpair.
void twinsSieve(uint indx) {
uint s = 6; // shift value for 64 bits
uint bmask = (1 << s) - 1; // bitmask val for 64 bits
auto sum = 0; // twins count for twin pair for segment
ulong Ki = Kmin - 1; // first resgroup val for range for slice
ulong K_max = Kmax; // last resgroup val for range for slice
uint Kn = Ks; // resgroup seg size for each seg slice
ulong hi_tp = 0; // value of largest tp hi prime in seg
ulong r_hi = restwins[indx]; // hi tp residue for this thread
ulong[] seg = new ulong[](((Ks-1) >> s) + 1); // seg array for Ks regroups
if (r_hi - 2 < (start_num - 2) % modpg + 2) ++Ki; // ensure lo tp in range
if (r_hi > (end_num - 2) % modpg + 2) --K_max; // ensure hi tp in range
ulong[] nextp = nextp_init(r_hi, Ki); // 1st prime mults for twin pair restracks
while (Ki < K_max) { // for Kn resgroup size slices upto Kmax
Kn = min(Ks, K_max - Ki); // set segment slice resgroup length
foreach (j, prime; primes) { // for each prime index r1..sqrt(N)
// for lower pair residue track
ulong k = nextp[j * 2]; // starting from this lower resgroup in seg
while (k < Kn) { // mark primenth resgroup bits prime mults
seg[k >> s] |= 1uL << (k & bmask);
k += prime; } // set next prime multiple resgroup
nextp[j * 2] = k - Kn; // save 1st resgroup in next eligible seg
// for upper pair residue track
k = nextp[j * 2 | 1]; // starting from this upper resgroup in seg
while (k < Kn) { // mark primenth resgroup prime mults
seg[k >> s] |= 1uL << (k & bmask);
k += prime; } // set next prime multiple resgroup
nextp[j * 2 | 1] = k - Kn; // save 1st resgroup in next eligible seg
} // set as nonprime unused bits in last seg[n]
// so fast, do for every seg[i]
seg[(Kn-1) >> s] = seg[(Kn-1) >> s] | (~1uL << ((Kn-1) & bmask));
auto cnt = 0; // count the cousinprimes in the segment
foreach (i; 0..((Kn-1) >> s) + 1) { cnt += popcnt(~seg[i]); }
if (cnt > 0) { // if segment has twin primes
sum += cnt; // add the segment count to total count
uint upk = Kn - 1; // from end of seg count backwards to largest tp
while ((seg[upk >> s] & (1uL << (upk & bmask))) != 0) --upk;
hi_tp = Ki + upk; // numerate its full resgroup value
}
Ki += Ks; // set 1st resgroup val of next seg slice
if (Ki < K_max) {foreach (b; 0..((Kn-1) >> s) + 1) seg[b] = 0;} // set seg all primes
}
// numerate largest twin prime in seg
hi_tp = ((r_hi > end_num) || (sum == 0)) ? 1 : hi_tp * modpg + r_hi;
lastwins[indx] = hi_tp; // store final seg tp value
cnts[indx] = sum; // sum for twin pair
}
/// Main routine to run twinprimes sieve, input ranges within 0..2^64 - 1.
void twinPrimesSsoz() {
ulong[] x;
foreach (_; 0 .. 2) { ulong a; readf!" %d"(a); x ~= a; }
end_num = max(x[0], 3);
start_num = max(x[1], 3);
if (start_num > end_num) swap(start_num, end_num);
start_num = start_num | 1; // if start_num even add 1
end_num = (end_num - 1) | 1; // if end_num even subtract 1
if (end_num - start_num < 2) { start_num = 7; end_num = 7; }
writeln("threads = ", totalCPUs);
auto stopWatchSetup = StopWatch();
stopWatchSetup.start(); // start timing sieve setup execution
setSieveParameters(start_num, end_num);
auto pairscnt = restwins.length;
auto krange = Kmax - Kmin + 1;
// create sieve primes <= sqrt(end_num), only use those whose multiples within inputs range
if (end_num < 49) primes ~= 5; // generate sieving primes
else sozPg(cast(ulong) sqrt(cast(double) end_num)); // <= sqrt(end_num)
writeln("each of ", pairscnt, " threads has nextp[", 2, " x ", primes.length, "] array");
ulong twinscnt = 0; // number of twin primes for range
auto lo_range = restwins[0] - 3; // lo_range = lo_tp - 1
foreach (tp; [3, 5, 11, 17]) { // excluded low tp values for PGs used
if (end_num == 3) break; // if 3 end of range, no twin primes
if ((start_num <= tp) && (tp < lo_range)) ++twinscnt; // count small tp if any
}
stopWatchSetup.stop(); // sieve setup time
writeln("setup time = ", stopWatchSetup.peek());
writeln("perform twinprimes ssoz sieve");
auto stopWatchExecution = StopWatch(); // start timing ssoz sieve execution
stopWatchExecution.start();
cnts = new uint[](pairscnt); // count of tps for each thread
lastwins = new ulong[](pairscnt); // largest hi_tp for each thread
shared uint threadscnt = 0; // count of finished threads
foreach (indx; parallel(iota(0, pairscnt))) { // do in parallel for each tp
twinsSieve(cast(uint) indx); // sieve selected twinpair restracks
write("\r", threadscnt.atomicOp!"+="(1), " of ", pairscnt, " twinpairs done");
}
write("\r", pairscnt, " of ", pairscnt, " twinpairs done");
ulong last_twin = 0uL;
foreach(i; 0..pairscnt) {
twinscnt += cnts[i];
if (last_twin < lastwins[i]) last_twin = lastwins[i];
}
if ((end_num == 5) && (twinscnt == 1)) last_twin = 5;
auto Kn = krange % Ks; // set num of resgroups in last slice
if (Kn == 0) Kn = Ks; // if mult of segsize set to seg size
stopWatchExecution.stop(); // sieve execution time
writeln("\nsieve time = ", stopWatchExecution.peek());
writeln("total time = ", stopWatchSetup.peek() + stopWatchExecution.peek());
writeln("last segment = ", Kn, " resgroups; segment slices = ", (krange-1) / Ks + 1);
writeln("total twins = ", twinscnt, "; last twin = ", last_twin - 1, "+/-1");
}
void main()
{ twinPrimesSsoz(); }
@veelo
Copy link

veelo commented Nov 11, 2018

Hi, just in case you didn't notice, I made a small contribution to the compile time part of this code: https://forum.dlang.org/post/hawlflxehtnklcvatggy@forum.dlang.org
I didn't cross-check the results with the Nim code, you might want to do that before publishing it.

Cheers,
Bastiaan.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment