Skip to content

Instantly share code, notes, and snippets.

@danaj
Last active November 29, 2021 17:09
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danaj/4385784 to your computer and use it in GitHub Desktop.
Save danaj/4385784 to your computer and use it in GitHub Desktop.
Prime counting utility using primesieve (http://code.google.com/p/primesieve/). Includes counting via sieve, Legendre, Meissel, and Lehmer methods. Part of the Math::Prime::Util Perl module (https://github.com/danaj/Math-Prime-Util). All code is in C, but primesieve makes a C++ library, hence the C++ compilation.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
/* Below this size, just sieve. */
#define SIEVE_LIMIT 1000000
/*****************************************************************************
*
* Lehmer prime counting utility. Calculates pi(x), count of primes <= x.
*
* Copyright (c) 2012-2013 Dana Jacobsen (dana@acm.org).
* This is free software; you can redistribute it and/or modify it under
* the same terms as the Perl 5 programming language system itself.
*
* This file is part of the Math::Prime::Util Perl module, but also can be
* compiled as a standalone UNIX program using the primesieve package.
*
* g++ -O3 -DPRIMESIEVE_STANDALONE lehmer.c -o prime_count -lprimesieve
*
* For faster prime counting in stage 4 with multiprocessor machines:
*
* g++ -O3 -DPRIMESIEVE_STANDALONE -DPRIMESIEVE_PARALLEL lehmer.c -o prime_count -lprimesieve -lgomp
*
* The phi(x,a) calculation is unique, to the best of my knowledge. It uses
* two lists of all x values + signed counts for the given 'a' value, and walks
* 'a' down until it is small enough to calculate directly (either with Mapes
* or using a calculated table using the primorial/totient method). This
* is relatively fast and low memory compared to many other solutions. As with
* all Lehmer-Meissel-Legendre algorithms, memory use will be a constraint
* with large values of x (see the table below).
*
* If you want something better, I highly recommend the paper "Computing
* Pi(x): the combinatorial method" (2006) by Tomás Oliveira e Silva. His
* implementation is certainly much faster and lower memory than this, but I
* have not seen any working source code for one of the LMO methods so it is
* difficult to compare.
*
* Using my sieve code with everything running in serial, calculating pi(10^12)
* is done under 1 second on my computer. pi(10^14) takes under 30 seconds,
* pi(10^16) in under 20 minutes. Compared with Thomas R. Nicely's pix4
* program, this one is 5x faster and uses 10x less memory. When compiled
* with parallel primesieve it is over 10x faster.
* pix4(10^16) takes 124 minutes, this code + primesieve takes < 4 minutes.
*
* Timings with Perl + MPU with all-serial computation. Using the standalone
* program with parallel primesieve speeds up stage 4 a lot for large values.
* The last column is the standalone time with 12-core parallel primesieve.
*
* n phi(x,a) mem/time | stage 4 mem/time | total time | pps time
* 10^19 1979.41 | ~13GB | | 7h 26m
* 10^18 5515MB 483.46 | 5390MB | | 87m 0s
* 10^17 1698MB 109.56 | 1568MB 9684.1 | 163m 36 s | 17m 37s
* 10^16 522MB 25.44 | 460MB 1066.3 | 18m 12 s | 3m 44s
* 10^15 159MB 5.86 | 137MB 141.2 | 2m 28 s | 48.17 s
* 10^14 48MB 1.34 | 41MB 22.55 | 23.58 s | 10.55 s
* 10^13 14MB 0.304 | 12MB 3.87 | 4.16 s | 2.40 s
* 10^12 4MB 0.070 | 4MB 0.716 | 0.78 s | 0.527
* 10^11 1MB 0.015 | 0.135 | 0.158s | 0.124s
* 10^10 0.003 | 0.029 | 0.028s | 0.036s
*
* Reference: Hans Riesel, "Prime Numbers and Computer Methods for
* Factorization", 2nd edition, 1994.
*/
static int const verbose = 0;
/* #define STAGE_TIMING 1 */
#ifdef STAGE_TIMING
#include <sys/time.h>
#define DECLARE_TIMING_VARIABLES struct timeval t0, t1;
#define TIMING_START gettimeofday(&t0, 0);
#define TIMING_END_PRINT(text) \
{ unsigned long long t; \
gettimeofday(&t1, 0); \
t = (t1.tv_sec-t0.tv_sec) * 1000000 + (t1.tv_usec - t0.tv_usec); \
printf("%s: %10.5f\n", text, ((double)t) / 1000000); }
#else
#define DECLARE_TIMING_VARIABLES
#define TIMING_START
#define TIMING_END_PRINT(text)
#endif
#ifdef PRIMESIEVE_STANDALONE
/* countPrimes seems to be pretty slow for small ranges, so sieve more small
* primes and count using binary search. Uses a lot of memory though. For
* big ranges, countPrimes is really fast. */
#define SIEVE_MULT 10
#include <limits.h>
#include <sys/time.h>
#ifdef PRIMESIEVE_PARALLEL
#include <primesieve/soe/ParallelPrimeSieve.h>
ParallelPrimeSieve ps;
#define SET_PPS_PARALLEL ps.setNumThreads(ParallelPrimeSieve::getMaxThreads())
#define SET_PPS_SERIAL ps.setNumThreads(1)
#else
#include <primesieve/soe/PrimeSieve.h>
PrimeSieve ps;
#define SET_PPS_PARALLEL /* */
#define SET_PPS_SERIAL /* */
#endif
/* Translations from Perl + Math::Prime::Util to C/C++ + primesieve */
typedef unsigned long UV;
typedef signed long IV;
#define UV_MAX ULONG_MAX
#define UVCONST(x) ((unsigned long)x##UL)
#define New(id, mem, size, type) mem = (type*) malloc((size)*sizeof(type))
#define Newz(id, mem, size, type) mem = (type*) calloc(size, sizeof(type))
#define Renew(mem, size, type) mem = (type*) realloc(mem,(size)*sizeof(type))
#define Safefree(mem) free((void*)mem)
#define _XS_prime_count(a, b) ps.countPrimes(a, b)
#define croak(fmt,...) { printf(fmt,##__VA_ARGS__); exit(1); }
#define prime_precalc(n) /* */
#define BITS_PER_WORD ((ULONG_MAX <= 4294967295UL) ? 32 : 64)
static UV isqrt(UV n)
{
UV root;
if (sizeof(UV) == 8 && n >= 18446744065119617025UL) return 4294967295UL;
if (sizeof(UV) == 4 && n >= 4294836225UL) return 65535UL;
root = (UV) sqrt((double)n);
while (root*root > n) root--;
while ((root+1)*(root+1) <= n) root++;
return root;
}
/* Callback used for creating an array of primes. */
static UV* sieve_array = 0;
static UV sieve_k;
static UV sieve_n;
void primesieve_callback(uint64_t pk) {
if (sieve_k <= sieve_n) {
if (pk < sieve_array[sieve_k-1])
croak("Primes generated out of order. Switch to serial mode.\n");
sieve_array[sieve_k++] = pk;
}
}
/* Generate an array of n small primes, where the kth prime is element p[k].
* Remember to free when done. */
static UV* generate_small_primes(UV n)
{
UV* primes;
double fn = (double)n;
double flogn = log(fn);
double flog2n = log(flogn);
UV nth_prime = /* Dusart 2010 for > 179k, custom for 18-179k */
(n >= 688383) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n-2.00)/flogn))) :
(n >= 178974) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n-1.95)/flogn))) :
(n >= 18) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n+0.30)/flogn)))
: 59;
New(0, primes, n+1, UV);
if (primes == 0)
croak("Can not allocate small primes\n");
primes[0] = 0;
sieve_array = primes;
sieve_n = n;
sieve_k = 1;
SET_PPS_SERIAL;
ps.generatePrimes(2, nth_prime, primesieve_callback);
SET_PPS_PARALLEL;
sieve_array = 0;
return primes;
}
#else
/* We will use pre-sieving to speed up counting for small ranges */
#define SIEVE_MULT 1
#include "lehmer.h"
#include "util.h"
#include "cache.h"
#include "sieve.h"
/* Generate an array of n small primes, where the kth prime is element p[k].
* Remember to free when done. */
static UV* generate_small_primes(UV n)
{
const unsigned char* sieve;
UV* primes;
UV i;
double fn = (double)n;
double flogn = log(fn);
double flog2n = log(flogn);
UV nth_prime = /* Dusart 2010 for > 179k, custom for 18-179k */
(n >= 688383) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n-2.00)/flogn))) :
(n >= 178974) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n-1.95)/flogn))) :
(n >= 18) ? (UV) ceil(fn*(flogn+flog2n-1.0+((flog2n+0.30)/flogn)))
: 59;
if (get_prime_cache(nth_prime, &sieve) < nth_prime) {
release_prime_cache(sieve);
croak("Could not generate sieve for %"UVuf, nth_prime);
}
New(0, primes, n+1, UV);
if (primes == 0)
croak("Can not allocate small primes\n");
primes[0] = 0; primes[1] = 2; primes[2] = 3; primes[3] = 5;
i = 3;
START_DO_FOR_EACH_SIEVE_PRIME( sieve, 7, nth_prime ) {
if (i >= n) break;
primes[++i] = p;
} END_DO_FOR_EACH_SIEVE_PRIME
release_prime_cache(sieve);
if (i < n)
croak("Did not generate enough small primes.\n");
if (verbose > 1) printf("generated %lu small primes, from 2 to %lu\n", i, primes[i]);
return primes;
}
#endif
static UV icbrt(UV n)
{
UV root = 0;
/* int s = BITS_PER_WORD - (BITS_PER_WORD % 3); */
#if BITS_PER_WORD == 32
int s = 30;
if (n >= UVCONST(4291015625)) return UVCONST(1625);
#else
int s = 63;
if (n >= UVCONST(18446724184312856125)) return UVCONST(2642245);
#endif
#if 0
/* The integer cube root code is about 30% faster for me */
root = (UV) pow(n, 1.0/3.0);
if (root*root*root > n) {
root--;
while (root*root*root > n) root--;
} else {
while ((root+1)*(root+1)*(root+1) <= n) root++;
}
#else
for ( ; s >= 0; s -= 3) {
UV b;
root += root;
b = 3*root*(root+1)+1;
if ((n >> s) >= b) {
n -= b << s;
root++;
}
}
#endif
return root;
}
/* Given an array of primes[1..lastprime], return Pi(n) where n <= lastprime.
* This is actually quite fast, and definitely faster than sieving. By using
* this we can avoid caching prime counts and also skip most calls to the
* segment siever.
*/
static UV bs_prime_count(UV n, UV const* const primes, UV lastprime)
{
UV i, j;
if (n < 2) return 0;
/* if (n > primes[lastprime]) return _XS_prime_count(2, n); */
if (n >= primes[lastprime]) {
if (n == primes[lastprime]) return lastprime;
croak("called bspc(%lu) with counts up to %lu\n", n, primes[lastprime]);
}
i = 1;
j = lastprime;
while (i < j) {
UV mid = (i+j)/2;
if (primes[mid] <= n) i = mid+1;
else j = mid;
}
return i-1;
}
/* Use Mapes' method to calculate phi(x,a) for small a. This is really
* convenient and a little Perl script will spit this code out for whatever
* limit we select. It gets unwieldy with large a values.
*/
static UV mapes(UV x, UV a)
{
IV val;
if (a == 0) return x;
if (a == 1) return x-x/2;
val = x-x/2-x/3+x/6;
if (a >= 3) val += 0-x/5+x/10+x/15-x/30;
if (a >= 4) val += 0-x/7+x/14+x/21-x/42+x/35-x/70-x/105+x/210;
if (a >= 5) val += 0-x/11+x/22+x/33-x/66+x/55-x/110-x/165+x/330+x/77-x/154-x/231+x/462-x/385+x/770+x/1155-x/2310;
if (a >= 6) val += 0-x/13+x/26+x/39-x/78+x/65-x/130-x/195+x/390+x/91-x/182-x/273+x/546-x/455+x/910+x/1365-x/2730+x/143-x/286-x/429+x/858-x/715+x/1430+x/2145-x/4290-x/1001+x/2002+x/3003-x/6006+x/5005-x/10010-x/15015+x/30030;
if (a >= 7) val += 0-x/17+x/34+x/51-x/102+x/85-x/170-x/255+x/510+x/119-x/238-x/357+x/714-x/595+x/1190+x/1785-x/3570+x/187-x/374-x/561+x/1122-x/935+x/1870+x/2805-x/5610-x/1309+x/2618+x/3927-x/7854+x/6545-x/13090-x/19635+x/39270+x/221-x/442-x/663+x/1326-x/1105+x/2210+x/3315-x/6630-x/1547+x/3094+x/4641-x/9282+x/7735-x/15470-x/23205+x/46410-x/2431+x/4862+x/7293-x/14586+x/12155-x/24310-x/36465+x/72930+x/17017-x/34034-x/51051+x/102102-x/85085+x/170170+x/255255-x/510510;
return (UV) val;
}
static UV mapes7(UV x) { /* A tiny bit faster setup for a=7 */
IV val = x-x/2-x/3-x/5+x/6-x/7+x/10-x/11-x/13+x/14+x/15-x/17+x/21+x/22+x/26
-x/30+x/33+x/34+x/35+x/39-x/42+x/51+x/55+x/65-x/66-x/70+x/77-x/78
+x/85+x/91-x/102-x/105-x/110+x/119-x/130+x/143-x/154-x/165-x/170
-x/182+x/187-x/195+x/210+x/221-x/231-x/238-x/255-x/273-x/286+x/330
-x/357-x/374-x/385+x/390-x/429-x/442-x/455+x/462+x/510+x/546-x/561
-x/595-x/663+x/714;
if (x >= 715) {
val += 0-x/715+x/770+x/858+x/910-x/935-x/1001-x/1105+x/1122+x/1155+x/1190
-x/1309+x/1326+x/1365+x/1430-x/1547+x/1785+x/1870+x/2002+x/2145
+x/2210-x/2310-x/2431+x/2618-x/2730+x/2805+x/3003+x/3094+x/3315
-x/3570+x/3927-x/4290+x/4641+x/4862+x/5005-x/5610-x/6006+x/6545
-x/6630+x/7293+x/7735-x/7854;
if (x >= 9282)
val += 0-x/9282-x/10010+x/12155-x/13090-x/14586-x/15015-x/15470+x/17017
-x/19635-x/23205-x/24310+x/30030-x/34034-x/36465+x/39270+x/46410
-x/51051+x/72930-x/85085+x/102102+x/170170+x/255255-x/510510;
}
return (UV) val;
}
/******************************************************************************/
/* In-order lists for manipulating our UV value / IV count pairs */
/******************************************************************************/
typedef struct {
UV v;
IV c;
} vc_t;
typedef struct {
vc_t* a;
UV size;
UV n;
} vcarray_t;
static vcarray_t vcarray_create(void)
{
vcarray_t l;
l.a = 0;
l.size = 0;
l.n = 0;
return l;
}
static void vcarray_destroy(vcarray_t* l)
{
if (l->a != 0) {
if (verbose > 2) printf("FREE list %p\n", l->a);
Safefree(l->a);
}
l->size = 0;
l->n = 0;
}
/* Insert a value/count pair. We do this indirection because about 80% of
* the calls result in a merge with the previous entry. */
static void vcarray_insert(vcarray_t* l, UV val, IV count)
{
UV n = l->n;
if (n > 0 && l->a[n-1].v < val)
croak("Previous value was %lu, inserting %lu out of order\n", l->a[n-1].v, val);
if (n >= l->size) {
UV new_size;
if (l->size == 0) {
new_size = 20000;
if (verbose>2) printf("ALLOCing list, size %lu\n", new_size);
New(0, l->a, new_size, vc_t);
} else {
new_size = (UV) (1.5 * l->size);
if (verbose>2) printf("REALLOCing list %p, new size %lu\n",l->a,new_size);
Renew( l->a, new_size, vc_t );
}
if (l->a == 0) croak("could not allocate list\n");
l->size = new_size;
}
/* printf(" inserting %lu %ld\n", val, count); */
l->a[n].v = val;
l->a[n].c = count;
l->n++;
}
/* Merge the two sorted lists A and B into A. Each list has no duplicates,
* but they may have duplications between the two. We're quite interested
* in saving memory, so first remove all the duplicates, then do an in-place
* merge. */
static void vcarray_merge(vcarray_t* a, vcarray_t* b)
{
long ai, bi, bj, k, kn;
long an = a->n;
long bn = b->n;
vc_t* aa = a->a;
vc_t* ba = b->a;
/* Merge anything in B that appears in A. */
for (ai = 0, bi = 0, bj = 0; bi < bn; bi++) {
/* Skip forward in A until empty or aa[ai].v <= ba[bi].v */
UV bval = ba[bi].v;
while (ai < an && aa[ai].v > bval)
ai++;
/* if A empty then copy the remaining elements */
if (ai >= an) {
if (bi == bj)
bj = bn;
else
while (bi < bn)
ba[bj++] = ba[bi++];
break;
}
if (aa[ai].v == bval)
aa[ai].c += ba[bi].c;
else
ba[bj++] = ba[bi];
}
if (verbose>2) printf(" removed %lu duplicates from b\n", bn - bj);
bn = bj;
if (bn == 0) { /* In case they were all duplicates */
b->n = 0;
return;
}
/* kn = the final merged size. All duplicates are gone, so this is exact. */
kn = an+bn;
if ((long)a->size < kn) { /* Make A big enough to hold kn elements */
UV new_size = (UV) (1.2 * kn);
if (verbose>2) printf("REALLOCing list %p, new size %lu\n", a->a, new_size);
Renew( a->a, new_size, vc_t );
aa = a->a; /* this could have been changed by the realloc */
a->size = new_size;
}
/* merge A and B. Very simple using reverse merge. */
ai = an-1;
bi = bn-1;
for (k = kn-1; k >= 0; k--) {
if (ai < 0) { /* A is exhausted, just filling in B */
if (bi < 0) croak("ran out of data during merge");
aa[k] = ba[bi--];
} else if (bi < 0) { /* We've caught up with A */
break;
} else if (aa[ai].v < ba[bi].v) {
aa[k] = aa[ai--];
} else {
if (aa[ai].v == ba[bi].v) croak("deduplication error");
aa[k] = ba[bi--];
}
}
a->n = kn; /* A now has this many items */
b->n = 0; /* B is marked empty */
}
/*
* The main phi(x,a) algorithm. In this implementation, it takes under 10%
* of the total time for the Lehmer algorithm, but is a big memory consumer.
*/
static UV phi(UV x, UV a)
{
UV i, val, sval;
UV sum = 0;
IV count;
const UV* primes;
vcarray_t a1, a2;
vc_t* arr;
if (a == 1) return ((x+1)/2);
if (a <= 7) return mapes(x, a);
primes = generate_small_primes(a+1);
if (primes == 0)
croak("Could not generate primes for phi(%lu,%lu)\n", x, a);
if (x < primes[a+1]) { Safefree(primes); return (x > 0) ? 1 : 0; }
a1 = vcarray_create();
a2 = vcarray_create();
vcarray_insert(&a1, x, 1);
while (a > 7) {
UV primea = primes[a];
UV sval_last = 0;
IV sval_count = 0;
arr = a1.a;
for (i = 0; i < a1.n; i++) {
count = arr[i].c;
if (count == 0) continue; /* Skip if count = 0 */
val = arr[i].v;
sval = val / primea;
if (sval < primea) break; /* stop inserting into a2 if small */
if (sval != sval_last) { /* non-merged value. Insert into a2 */
if (sval_last != 0)
vcarray_insert(&a2, sval_last, sval_count);
sval_last = sval;
sval_count = 0;
}
sval_count -= count; /* Accumulate count for this sval */
}
if (sval_last != 0) /* Insert the last sval */
vcarray_insert(&a2, sval_last, sval_count);
/* For each small sval, add up the counts */
for ( ; i < a1.n; i++)
sum -= arr[i].c;
/* Merge a1 and a2 into a1. a2 will be emptied. */
vcarray_merge(&a1, &a2);
a--;
}
vcarray_destroy(&a2);
if (a != 7) croak("final loop is set for a=7, a = %lu\n", a);
arr = a1.a;
for (i = 0; i < a1.n; i++) {
count = arr[i].c;
if (count != 0)
sum += count * mapes7( arr[i].v );
}
vcarray_destroy(&a1);
Safefree(primes);
return (UV) sum;
}
/* Legendre's method. Interesting and a good test for phi(x,a), but Lehmer's
* method is much faster (Legendre: a = pi(n^.5), Lehmer: a = pi(n^.25)) */
UV _XS_legendre_pi(UV n)
{
UV a;
if (n < SIEVE_LIMIT)
return _XS_prime_count(2, n);
a = _XS_legendre_pi(isqrt(n));
return phi(n, a) + a - 1;
}
/* Meissel's method. */
UV _XS_meissel_pi(UV n)
{
UV a, b, c, sum, i, lastprime, lastpc, lastw, lastwpc;
const UV* primes = 0; /* small prime cache */
DECLARE_TIMING_VARIABLES;
if (n < SIEVE_LIMIT)
return _XS_prime_count(2, n);
if (verbose > 0) printf("meissel %lu stage 1: calculate a,b,c \n", n);
TIMING_START;
a = _XS_meissel_pi(icbrt(n)); /* a = floor(n^1/3) */
b = _XS_meissel_pi(isqrt(n)); /* b = floor(n^1/2) */
c = a; /* c = a */
TIMING_END_PRINT("stage 1")
if (verbose > 0) printf("meissel %lu stage 2: phi(x,a) (a=%lu b=%lu c=%lu)\n", n, a, b, c);
TIMING_START;
sum = phi(n, a) + ((b+a-2) * (b-a+1) / 2);
if (verbose > 0) printf("phi(%lu,%lu) = %lu. sum = %lu\n", n, a, sum - ((b+a-2) * (b-a+1) / 2), sum);
TIMING_END_PRINT("phi(x,a)")
lastprime = b*SIEVE_MULT;
if (verbose > 0) printf("meissel %lu stage 3: %lu small primes\n", n, lastprime);
TIMING_START;
primes = generate_small_primes(lastprime);
if (primes == 0) croak("Error generating primes.\n");
lastpc = primes[lastprime];
TIMING_END_PRINT("small primes")
prime_precalc(isqrt(n / primes[a+1]));
prime_precalc( (UV) pow(n, 2.0/5.0) ); /* Sieve more for speed */
if (verbose > 0) printf("meissel %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]);
TIMING_START;
/* Reverse the i loop so w increases. Count w in segments. */
lastw = 0;
lastwpc = 0;
for (i = b; i > a; i--) {
UV w = n / primes[i];
lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastprime)
: lastwpc + _XS_prime_count(lastw+1, w);
lastw = w;
sum = sum - lastwpc;
}
TIMING_END_PRINT("stage 4")
Safefree(primes);
return sum;
}
/* Lehmer's method. This is basically Riesel's Lehmer function (page 22),
* with some additional code to help optimize it. */
UV _XS_lehmer_pi(UV n)
{
UV z, a, b, c, sum, i, j, lastprime, lastpc, lastw, lastwpc;
const UV* primes = 0; /* small prime cache, first b=pi(z)=pi(sqrt(n)) */
DECLARE_TIMING_VARIABLES;
if (n < SIEVE_LIMIT)
return _XS_prime_count(2, n);
/* Protect against overflow. 2^32-1 and 2^64-1 are both divisible by 3. */
if (n == UV_MAX) {
if ( (n%3) == 0 || (n%5) == 0 || (n%7) == 0 || (n%31) == 0 )
n--;
else
return _XS_prime_count(2,n);
}
if (verbose > 0) printf("lehmer %lu stage 1: calculate a,b,c \n", n);
TIMING_START;
z = isqrt(n);
a = _XS_lehmer_pi(isqrt(z)); /* a = floor(n^1/4) */
b = _XS_lehmer_pi(z); /* b = floor(n^1/2) */
c = _XS_lehmer_pi(icbrt(n)); /* c = floor(n^1/3) */
TIMING_END_PRINT("stage 1")
if (verbose > 0) printf("lehmer %lu stage 2: phi(x,a) (z=%lu a=%lu b=%lu c=%lu)\n", n, z, a, b, c);
TIMING_START;
sum = phi(n, a) + ((b+a-2) * (b-a+1) / 2);
TIMING_END_PRINT("phi(x,a)")
/* We get an array of the first b primes. This is used in stage 4. If we
* get more than necessary, we can use them to speed up some.
*/
lastprime = b*SIEVE_MULT;
if (verbose > 0) printf("lehmer %lu stage 3: %lu small primes\n", n, lastprime);
TIMING_START;
primes = generate_small_primes(lastprime);
if (primes == 0) croak("Error generating primes.\n");
lastpc = primes[lastprime];
TIMING_END_PRINT("small primes")
TIMING_START;
/* Speed up all the prime counts by doing a big base sieve */
prime_precalc( (UV) pow(n, 3.0/5.0) );
/* Ensure we have the base sieve for big prime_count ( n/primes[i] ). */
/* This is about 75k for n=10^13, 421k for n=10^15, 2.4M for n=10^17 */
prime_precalc(isqrt(n / primes[a+1]));
TIMING_END_PRINT("sieve precalc")
if (verbose > 0) printf("lehmer %lu stage 4: loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]);
TIMING_START;
/* Reverse the i loop so w increases. Count w in segments. */
lastw = 0;
lastwpc = 0;
for (i = b; i >= a+1; i--) {
UV w = n / primes[i];
lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastprime)
: lastwpc + _XS_prime_count(lastw+1, w);
lastw = w;
sum = sum - lastwpc;
if (i <= c) {
UV bi = bs_prime_count( isqrt(w), primes, lastprime );
for (j = i; j <= bi; j++) {
sum = sum - bs_prime_count(w / primes[j], primes, lastprime) + j - 1;
}
/* We could wrap the +j-1 in: sum += ((bi+1-i)*(bi+i))/2 - (bi-i+1); */
}
}
TIMING_END_PRINT("stage 4")
Safefree(primes);
return sum;
}
UV _XS_LMO_pi(UV n)
{
UV a, b, sum, i, lastprime, lastpc, lastw, lastwpc;
UV n13, n12, n23;
IV S1;
UV S2, P2;
const UV* primes = 0; /* small prime cache */
char* mu = 0; /* moebius to n^1/3 */
UV* lpf = 0; /* least prime factor to n^1/3 */
DECLARE_TIMING_VARIABLES;
if (n < SIEVE_LIMIT)
return _XS_prime_count(2, n);
if (verbose > 0) printf("LMO %lu stage 1: calculate pi(n^1/3) \n", n);
TIMING_START;
n13 = icbrt(n);
n12 = isqrt(n);
n23 = (UV) (pow(n, 2.0/3.0)+0.01);
a = _XS_lehmer_pi(n13);
b = _XS_lehmer_pi(n12);
TIMING_END_PRINT("stage 1")
lastprime = b*SIEVE_MULT;
if (verbose > 0) printf("LMO %lu stage 2: %lu small primes\n", n, lastprime);
TIMING_START;
primes = generate_small_primes(lastprime);
if (primes == 0) croak("Error generating primes.\n");
lastpc = primes[lastprime];
TIMING_END_PRINT("small primes")
if (verbose > 0) printf("LMO %lu stage 3: calculate mu/lpf to %lu\n", n, a);
TIMING_START;
/* We could call MPU's:
* mu = _moebius_range(0, n13+1)
* but (1) it's a bit slower (something to be addressed), and (2) we will
* do the least prime factor calculation at the same time.
*/
New(0, mu, n13+1, char);
memset(mu, 1, sizeof(char) * (n13+1));
New(0, lpf, n13+1, UV);
memset(lpf, 0, sizeof(UV) * (n13+1));
mu[0] = 0;
for (i = 1; i <= a; i++) {
UV primei = primes[i];
UV j, isquared;
for (j = primei; j <= n13; j += primei) {
mu[j] = -mu[j];
if (lpf[j] == 0) lpf[j] = primei;
}
isquared = primei * primei;
for (j = isquared; j <= n13; j += isquared)
mu[j] = 0;
}
/* for (i = 0; i <= n13; i++) { printf("mu %lu %ld\n", i, (IV)mu[i]); } */
TIMING_END_PRINT("mu")
if (verbose > 0) printf("LMO %lu stage 4: calculate S1 (%lu)\n", n, n13);
TIMING_START;
S1 = 0;
for (i = 1; i <= n13; i++)
if (mu[i] != 0)
S1 += mu[i] * (IV) (n/i);
TIMING_END_PRINT("S1")
if (verbose > 0) printf("LMO %lu stage 4: S1 = %ld\n", n, S1);
S2 = 0;
/* TODO... */
Safefree(mu);
Safefree(lpf);
prime_precalc(isqrt(n / primes[a+1]));
if (verbose > 0) printf("LMO %lu stage 5: P2 loop %lu to %lu, pc to %lu\n", n, a+1, b, n/primes[a+1]);
TIMING_START;
P2 = 0;
/* Reverse the i loop so w increases. Count w in segments. */
lastw = 0;
lastwpc = 0;
for (i = b; i > a; i--) {
UV w = n / primes[i];
lastwpc = (w <= lastpc) ? bs_prime_count(w, primes, lastprime)
: lastwpc + _XS_prime_count(lastw+1, w);
lastw = w;
P2 += lastwpc;
}
P2 -= ((b+a-2) * (b-a+1) / 2) - a + 1;
TIMING_END_PRINT("P2")
if (verbose > 0) printf("LMO %lu stage 5: P2 = %lu\n", n, P2);
Safefree(primes);
sum = P2 + S1 + S2;
return sum;
}
#ifdef PRIMESIEVE_STANDALONE
int main(int argc, char *argv[])
{
UV n, pi;
double t;
const char* method;
struct timeval t0, t1;
if (argc <= 1) { printf("usage: %s <n> [<method>]\n", argv[0]); return(1); }
n = strtoul(argv[1], 0, 10);
if (n < 2) { printf("Pi(%lu) = 0\n", n); return(0); }
if (argc > 2)
method = argv[2];
else
method = "lehmer";
gettimeofday(&t0, 0);
SET_PPS_PARALLEL;
if (!strcasecmp(method, "lehmer")) { pi = _XS_lehmer_pi(n); }
else if (!strcasecmp(method, "meissel")) { pi = _XS_meissel_pi(n); }
else if (!strcasecmp(method, "legendre")) { pi = _XS_legendre_pi(n); }
else if (!strcasecmp(method, "lmo")) { pi = _XS_LMO_pi(n); }
else if (!strcasecmp(method, "sieve")) { pi = _XS_prime_count(2, n); }
else {
printf("method must be one of: lehmer, meissel, legendre, lmo, or sieve\n");
return(2);
}
gettimeofday(&t1, 0);
t = (t1.tv_sec-t0.tv_sec); t *= 1000000.0; t += (t1.tv_usec - t0.tv_usec);
printf("%8s Pi(%lu) = %lu in %10.5fs\n", method, n, pi, t / 1000000.0);
return(0);
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment