Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Benchmark prime sieve speed between vector<char> and vector<bool>
>> g++ -std=c++20 -Ofast benchmark_prime_sieve.cpp && ./a.out
N = 30000000:
norm: bool: 106 ms
norm: char: 235 ms
norm: bool is 2.217 times FASTER
seg: bool: 147 ms
seg: char: 38 ms
seg: bool is 3.868 times SLOWER
N = 60000000:
norm: bool: 300 ms
norm: char: 527 ms
norm: bool is 1.757 times FASTER
seg: bool: 311 ms
seg: char: 87 ms
seg: bool is 3.575 times SLOWER
N = 90000000:
norm: bool: 498 ms
norm: char: 811 ms
norm: bool is 1.629 times FASTER
seg: bool: 488 ms
seg: char: 143 ms
seg: bool is 3.413 times SLOWER
N = 200000000:
norm: bool: 1305 ms
norm: char: 1987 ms
norm: bool is 1.523 times FASTER
seg: bool: 1164 ms
seg: char: 380 ms
seg: bool is 3.063 times SLOWER
N = 500000000:
norm: bool: 3731 ms
norm: char: 5210 ms
norm: bool is 1.396 times FASTER
seg: bool: 3268 ms
seg: char: 1214 ms
seg: bool is 2.692 times SLOWER
N = 1000000000:
norm: bool: 7868 ms
norm: char: 11066 ms
norm: bool is 1.406 times FASTER
seg: bool: 7273 ms
seg: char: 3039 ms
seg: bool is 2.393 times SLOWER
#include "bits/stdc++.h"
using namespace std;
template <typename T>
int count_primes_seg(int n) {
const int S = 10'000;
vector<int> primes;
int nsqrt = sqrt(n);
vector<T> is_prime(nsqrt + 1, true);
for (int i = 2; i <= nsqrt; i++) {
if (is_prime[i]) {
primes.push_back(i);
for (int j = i * i; j <= nsqrt; j += i)
is_prime[j] = false;
}
}
int result = 0;
vector<T> block(S);
for (int k = 0; k * S <= n; k++) {
fill(block.begin(), block.end(), true);
int start = k * S;
for (int p : primes) {
int start_idx = (start + p - 1) / p;
int j = max(start_idx, p) * p - start;
for (; j < S; j += p)
block[j] = false;
}
if (k == 0)
block[0] = block[1] = false;
for (int i = 0; i < S && start + i <= n; i++) {
if (block[i])
result++;
}
}
return result;
}
template <typename T>
int count_primes(int n) {
vector<T> is_prime(n+1, true);
is_prime[0] = is_prime[1] = false;
for (int i = 2; i * i <= n; i++) {
if (is_prime[i]) {
for (int j = i * i; j <= n; j += i)
is_prime[j] = false;
}
}
int cnt = 0;
for (int i = 2; i <= n; i++) {
if (is_prime[i])
cnt++;
}
return cnt;
/* return is_prime[n]; */
}
template <typename T>
long long benchmark(int n, string s)
{
auto t1 = std::chrono::high_resolution_clock::now();
int res = count_primes<T>(n);
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);
cout << " norm: " << s << ": " << duration.count() << " ms" << endl;
return duration.count();
}
template <typename T>
long long benchmark_seg(int n, string s)
{
auto t1 = std::chrono::high_resolution_clock::now();
int res = count_primes_seg<T>(n);
auto t2 = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);
cout << " seg: " << s << ": " << duration.count() << " ms" << endl;
return duration.count();
}
int main() {
ios_base::sync_with_stdio(false);
cin.tie(nullptr);
std::cout << std::fixed << std::setprecision(3);
for (int n : {3e7, 6e7, 9e7, 2e8, 5e8, 1e9}) {
cout << "N = " << n << ": " << endl;
{
long long a = benchmark<bool>(n, "bool");
long long b = benchmark<char>(n, "char");
cout << " norm: bool is ";
if (a < b) {
cout << ((double)b / a) << " times FASTER" << endl;
} else {
cout << ((double)a / b) << " times SLOWER" << endl;
}
}
{
long long a = benchmark_seg<bool>(n, "bool");
long long b = benchmark_seg<char>(n, "char");
cout << " seg: bool is ";
if (a < b) {
cout << ((double)b / a) << " times FASTER" << endl;
} else {
cout << ((double)a / b) << " times SLOWER" << endl;
}
}
cout << endl;
}
}
@jxu

This comment has been minimized.

Copy link

@jxu jxu commented Oct 16, 2021

Did you try with std::bitset?

@jakobkogler

This comment has been minimized.

Copy link
Owner Author

@jakobkogler jakobkogler commented Oct 17, 2021

@jxu
Just did a small experiment. Only for 1e9 though (as bitsets require the size to be known at compile time). Looks like std::bitset is a bit slower than std::vector<bool> during the normal sieve, but a bit faster during the segmented sieve. Not sure why.

N = 1000000000: 
    norm: vector<bool>:   7775 ms
    norm: vector<char>:  11992 ms
    norm: bitset:         9291 ms
    seg:  vector<bool>:   7305 ms
    seg:  vector<char>:   3110 ms
    seg:  bitset:         6501 ms

Maybe it's possible to optimize the bitset solution a little bit. I basically just replaced std::vector<T> v(size) with std::bitset<size> v in the code. But maybe there are some clever bitmask tricks that you can use.

@jakobkogler

This comment has been minimized.

Copy link
Owner Author

@jakobkogler jakobkogler commented Oct 17, 2021

For reference:

int count_primes_seg_bitset(int n) {
    if (n != 1000000000)
        return 0;
    constexpr int S = 10'000;

    vector<int> primes;
    constexpr int nsqrt = 31623;
    auto is_prime = std::move(bitset<nsqrt + 1>{}.set());
    for (int i = 2; i <= nsqrt; i++) {
        if (is_prime[i]) {
            primes.push_back(i);
            for (int j = i * i; j <= nsqrt; j += i)
                is_prime[j] = false;
        }
    }

    int result = 0;
    bitset<S> block;
    for (int k = 0; k * S <= n; k++) {
        block.set();
        int start = k * S;
        for (int p : primes) {
            int start_idx = (start + p - 1) / p;
            int j = max(start_idx, p) * p - start;
            for (; j < S; j += p)
                block[j] = false;
        }
        if (k == 0)
            block[0] = block[1] = false;
        for (int i = 0; i < S && start + i <= n; i++) {
            if (block[i])
                result++;
        }
    }
    return result;
}

int count_primes_bitset(int n) {
    if (n != 1000000000)
        return 0;
    auto is_prime = std::move(bitset<1000000000>{}.set());
    is_prime[0] = is_prime[1] = false;
    for (int i = 2; i * i <= n; i++) {
        if (is_prime[i]) {
            for (int j = i * i; j <= n; j += i)
                is_prime[j] = false;
        }
    }
    int cnt = 0;
    for (int i = 2; i <= n; i++) {
        if (is_prime[i])
            cnt++;
    }
    return cnt;
}
@jxu

This comment has been minimized.

Copy link

@jxu jxu commented Oct 17, 2021

The point of bitmask is not to need clever bitmask tricks I suppose, so if there's no significant difference then vector being dynamic makes it easier to use.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment