/// @file segmented_bit_sieve.cpp | |
/// @author Kim Walisch, <kim.walisch@gmail.com> | |
/// @brief This is an implementation of the segmented sieve of | |
/// Eratosthenes which uses a bit array with 16 numbers per | |
/// byte. It generates the primes below 10^10 in 7.25 seconds | |
/// on an Intel Core i7-6700 3.4 GHz CPU. | |
/// @license Public domain. | |
#include <iostream> | |
#include <algorithm> | |
#include <cmath> | |
#include <vector> | |
#include <cstdlib> | |
#include <stdint.h> | |
/// Set your CPU's L1 data cache size (in bytes) here | |
const int64_t L1D_CACHE_SIZE = 32768; | |
/// Bitmasks needed to unset bits corresponding to multiples | |
const int64_t unset_bit[16] = | |
{ | |
~(1 << 0), ~(1 << 0), | |
~(1 << 1), ~(1 << 1), | |
~(1 << 2), ~(1 << 2), | |
~(1 << 3), ~(1 << 3), | |
~(1 << 4), ~(1 << 4), | |
~(1 << 5), ~(1 << 5), | |
~(1 << 6), ~(1 << 6), | |
~(1 << 7), ~(1 << 7) | |
}; | |
const int64_t popcnt[256] = | |
{ | |
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, | |
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, | |
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 | |
}; | |
/// Generate primes using the segmented sieve of Eratosthenes. | |
/// This algorithm uses O(n log log n) operations and O(sqrt(n)) space. | |
/// @param limit Sieve primes <= limit. | |
/// | |
void segmented_sieve(int64_t limit) | |
{ | |
int64_t sqrt = (int64_t) std::sqrt(limit); | |
int64_t sieve_size = std::max(sqrt, L1D_CACHE_SIZE); | |
int64_t segment_size = sieve_size * 16; | |
int64_t count = (limit == 1) ? -1 : 0; | |
// we sieve primes >= 3 | |
int64_t i = 3; | |
int64_t s = 3; | |
// vector used for sieving | |
std::vector<uint8_t> sieve(sieve_size); | |
std::vector<uint8_t> is_prime(sqrt + 1, true); | |
std::vector<int64_t> primes; | |
std::vector<int64_t> multiples; | |
for (int64_t low = 0; low <= limit; low += segment_size) | |
{ | |
std::fill(sieve.begin(), sieve.end(), 0xff); | |
// current segment = [low, high] | |
int64_t high = low + segment_size - 1; | |
high = std::min(high, limit); | |
sieve_size = (high - low) / 16 + 1; | |
// generate sieving primes using simple sieve of Eratosthenes | |
for (; i * i <= high; i += 2) | |
if (is_prime[i]) | |
for (int64_t j = i * i; j <= sqrt; j += i) | |
is_prime[j] = false; | |
// initialize sieving primes for segmented sieve | |
for (; s * s <= high; s += 2) | |
{ | |
if (is_prime[s]) | |
{ | |
primes.push_back(s); | |
multiples.push_back(s * s - low); | |
} | |
} | |
// sieve segment using bit array | |
for (std::size_t i = 0; i < primes.size(); i++) | |
{ | |
int64_t j = multiples[i]; | |
for (int64_t k = primes[i] * 2; j < segment_size; j += k) | |
sieve[j >> 4] &= unset_bit[j & 15]; | |
multiples[i] = j - segment_size; | |
} | |
// unset bits > limit | |
if (high == limit) | |
{ | |
int64_t bits = 0xff << (limit % 16 + 1) / 2; | |
sieve[sieve_size - 1] &= ~bits; | |
} | |
// count primes | |
for (int64_t n = 0; n < sieve_size; n++) | |
count += popcnt[sieve[n]]; | |
} | |
std::cout << count << " primes found." << std::endl; | |
} | |
/// Usage: ./segmented_sieve n | |
/// @param n Sieve the primes up to n. | |
/// | |
int main(int argc, char** argv) | |
{ | |
if (argc >= 2) | |
segmented_sieve(std::atoll(argv[1])); | |
else | |
segmented_sieve(1000000000); | |
return 0; | |
} |
I love your program, it is amazing. My best program can only sieve up_to 36e6 numbers in 0.9s < x < 1s
If I dont use the popcnt[256] can I implement by another way or even gen it some how ?
The popcnt lookup table returns the number of bits set in a byte e.g. for 2 it returns 1, for 7 it returns 3, ... Many compilers and or standard libraries have a special function for counting the number of bits set in a byte, int or long. E.g. GCC & Clang have __builtin_popcount(x)
, the MSVC compiler has _mm_popcnt_u64(x)
. If you are using another language it is best to check if such a function is available in your language. If such a function is not available you can use portable (but slow) code from: https://en.wikipedia.org/wiki/Hamming_weight
If you want to print the primes instead of counting them then you have to understand that each bit of the sieve array corresponds to an odd integer. So you could use code like this to print the primes:
for (int64_t n = 0; n < sieve_size; n++)
for (int i = 0; i < 8; i++)
if (sieve[n] & (1 << i))
std::cout << low + n * 16 + i * 2 + 1 << std::endl;
However iterating over the bits like this is probably slow and hence for printing primes I would recommend the simpler byte sieve from https://github.com/kimwalisch/primesieve/wiki/Segmented-sieve-of-Eratosthenes.
Thanks <3 You is still online <3 Thanks for replying <3
If I dont use the popcnt[256] can I implement by another way or even gen it some how ?
The popcnt lookup table returns the number of bits set in a byte e.g. for 2 it returns 1, for 7 it returns 3, ... Many compilers and or standard libraries have a special function for counting the number of bits set in a byte, int or long. E.g. GCC & Clang have
__builtin_popcount(x)
, the MSVC compiler has_mm_popcnt_u64(x)
. If you are using another language it is best to check if such a function is available in your language. If such a function is not available you can use portable (but slow) code from: https://en.wikipedia.org/wiki/Hamming_weight
Oh, so I can presolve popcnt[256]
by this way ?
for (int i = 0; i < 256; ++i)
popcnt[i] = __builtin_popcount(i);
Can I use these loops out of the main for(low = 0 - > lim)
loops ?
// generate sieving primes using simple sieve of Eratosthenes
for (int32_t i = 3; i * i <= high; i += 2)
if (is_prime[i])
for (int64_t j = i * i; j <= sqrt; j += i)
is_prime[j] = false;
// initialize sieving primes for segmented sieve
for (int32_t s = 3; s * s <= high; s += 2)
{
if (is_prime[s])
{
primes.push_back(s);
multiples.push_back(s * s);
}
}
Oh, so I can presolve popcnt[256] by this way ?
for (int i = 0; i < 256; ++i)
popcnt[i] = __builtin_popcount(i);
Yes! Or even better you can simply use:
// count primes
for (int64_t n = 0; n < sieve_size; n++)
count += __builtin_popcount(sieve[n]);
How can I get all primes into a vector ?
Oh, so I can presolve popcnt[256] by this way ?
for (int i = 0; i < 256; ++i) popcnt[i] = __builtin_popcount(i);
Yes! Or even better you can simply use:
// count primes for (int64_t n = 0; n < sieve_size; n++) count += __builtin_popcount(sieve[n]);
Thanks for replying
Can I use these loops out of the main
for(low = 0 - > lim)
loops ?// generate sieving primes using simple sieve of Eratosthenes for (int32_t i = 3; i * i <= high; i += 2) if (is_prime[i]) for (int64_t j = i * i; j <= sqrt; j += i) is_prime[j] = false; // initialize sieving primes for segmented sieve for (int32_t s = 3; s * s <= high; s += 2) { if (is_prime[s]) { primes.push_back(s); multiples.push_back(s * s); } }
If I can, should I do that ? And why did you uses these loops inside
Can I use these loops out of the main for(low = 0 - > lim) loops ?
// generate sieving primes using simple sieve of Eratosthenes
for (int32_t i = 3; i * i <= high; i += 2)
if (is_prime[i])
for (int64_t j = i * i; j <= sqrt; j += i)
is_prime[j] = false;
// initialize sieving primes for segmented sieve
for (int32_t s = 3; s * s <= high; s += 2)
{
if (is_prime[s])
{
primes.push_back(s);
multiples.push_back(s * s);
}
}
You can only move the first loop out of the main loop, but you need to modify it a little:
// Change i * i <= high to i * i <= sqrt
for (int64_t i = 3; i * i <= sqrt; i += 2)
if (is_prime[i])
for (int64_t j = i * i; j <= sqrt; j += i)
is_prime[j] = false;
How can I get all primes into a vector ?
I think this should work, but I have not tested it.
for (int64_t n = 0; n < sieve_size; n++)
for (int i = 0; i < 8; i++)
if (sieve[n] & (1 << i))
all_primes.push_back(low + n * 16 + i * 2 + 1);
How can I get all primes into a vector ?
I think this should work, but I have not tested it.
for (int64_t n = 0; n < sieve_size; n++) for (int i = 0; i < 8; i++) if (sieve[n] & (1 << i)) all_primes.push_back(low + n * 16 + i * 2 + 1);
Thanks and sorry for asking such silly questions, I think I have learn and research too hard that my cognitive is bad now. I should have take a rest. See you later, have a good day <3
Thank you very much <3
How can I get all primes into a vector ?
I think this should work, but I have not tested it.
for (int64_t n = 0; n < sieve_size; n++) for (int i = 0; i < 8; i++) if (sieve[n] & (1 << i)) all_primes.push_back(low + n * 16 + i * 2 + 1);
I tested for many problems but non of them failed.
I find that if we only care for
constexpr int64_t offset[] = {2, 4};
...
boolt ts = false;
...
void segmented_sieve(int64_t limit)
{
...
for (; s * s <= high; s += offset[ts ^= 1])
{
if (is_prime[s])
{
primes.push_back(s);
multiples.push_back(s * s - low);
}
}
...
}
Also I think you would have known: on small scale of arround 2^32, using static array with above adding method the code would run twice as fast as original (tested on codeforces, ideone)
If I dont use the
popcnt[256]
can I implement by another way or even gen it some how ?