Skip to content

Instantly share code, notes, and snippets.

@bortkiewicz
Created May 22, 2020 06:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bortkiewicz/778299fafb02711ffaea1ca35210d6d7 to your computer and use it in GitHub Desktop.
Save bortkiewicz/778299fafb02711ffaea1ca35210d6d7 to your computer and use it in GitHub Desktop.
bithybrid
#pragma GCC target("avx512f,avx512bw")
// writer: w33z8kqrqk8zzzx33
#include <bits/stdc++.h>
#include <immintrin.h>
using namespace std;
// begin fast read template by CYJian (source: https://www.luogu.com.cn/paste/i11c3ppx)
namespace io {
const int __SIZE = (1 << 25) + 1;
char ibuf[__SIZE], *iS, *iT, obuf[__SIZE], *oS = obuf, *oT = oS + __SIZE - 1, __c, qu[55]; int __f, qr, _eof;
#define Gc() (iS == iT ? (iT = (iS = ibuf) + fread (ibuf, 1, __SIZE, stdin), (iS == iT ? EOF : *iS ++)) : *iS ++)
inline void flush () { fwrite (obuf, 1, oS - obuf, stdout), oS = obuf; }
inline void reverse() { iS = ibuf; }
inline void gc (char &x) { x = Gc(); }
inline void pc (char x) { *oS ++ = x; if (oS == oT) flush (); }
inline void pstr (const char *s) { int __len = strlen(s); for (__f = 0; __f < __len; ++__f) pc (s[__f]); }
inline void gstr (char *s) { for(__c = Gc(); __c < 32 || __c > 126 || __c == ' ';) __c = Gc();
for(; __c > 31 && __c < 127 && __c != ' '; ++s, __c = Gc()) *s = __c; *s = 0; }
template <class I> inline bool gi (I &x) { _eof = 0;
for (__f = 1, __c = Gc(); (__c < '0' || __c > '9') && !_eof; __c = Gc()) { if (__c == '-') __f = -1; _eof |= __c == EOF; }
for (x = 0; __c <= '9' && __c >= '0' && !_eof; __c = Gc()) x = x * 10 + (__c & 15), _eof |= __c == EOF; x *= __f; return !_eof; }
template <class I> inline void print (I x) { if (!x) pc ('0'); if (x < 0) pc ('-'), x = -x;
while (x) qu[++ qr] = x % 10 + '0', x /= 10; while (qr) pc (qu[qr --]); }
struct Flusher_ {~Flusher_(){flush();}}io_flusher_;
} using io::pc; using io::gc; using io::pstr; using io::gstr; using io::gi; using io::print;
// end fast read template by CYJian
#define iter(i, a, b) for(int i=(a); i<(b); i++)
#define reti(i, a, b) for(int i=(b)-1; i>=(a); i--)
//#define rep(i, a) iter(i, 0, a)
#define rep1(i, a) iter(i, 1, (a)+1)
#define all(a) a.begin(), a.end()
#define fi first
#define se second
#define pb push_back
using ll=long long;
namespace Par {
#define rep(i, from, to) for (int i = from; i < (to); ++i)
#define trav(a, x) for (auto& a : x)
#define sz(x) (int)(x).size()
typedef long long ll;
typedef pair<int, int> pii;
typedef vector<int> vi;
#ifdef USE_AVX256
typedef __m256i M;
#define MM(op) _mm256_ ## op
#define MM_ANDNOT(a,b) _mm256_andnot_si256(a,b)
#define ZERO MM(setzero_si256)()
#define MM_SHUFFLE_MASK(a,b,c,d) \
_mm256_set_epi32(a,b,c,d, a,b,c,d)
#define M_WORDS 4
#else
typedef __m512i M;
#define MM(op) _mm512_ ## op
#define MM_ANDNOT(a,b) _mm512_andnot_si512(a,b)
#define ZERO MM(setzero_si512)()
#define MM_SHUFFLE_MASK(a,b,c,d) \
_mm512_set_epi32(a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d)
#define M_WORDS 8
#endif
#define ONE MM(set1_epi32)(-1)
#define M_BITS (M_WORDS * 64)
union U {
M m;
uint64_t words[M_WORDS];
uint16_t shorts[4*M_WORDS];
};
// m >> 1
U srl1(U u) {
rep(i,0,M_WORDS-1)
u.words[i] = (u.words[i] >> 1) | (u.words[i + 1] << 63);
u.words[M_WORDS-1] >>= 1;
return u;
}
// m << 1
U sll1(U u) {
for (int i = M_WORDS-1; i >= 1; i--)
u.words[i] = (u.words[i] << 1) | (u.words[i - 1] >> 63);
u.words[0] <<= 1;
return u;
}
void setbit(U& u, unsigned bit) {
u.words[bit >> 6] |= 1ULL << (bit & 63);
}
bool getbit(U& u, unsigned bit) {
return u.words[bit >> 6] & (1ULL << (bit & 63));
}
ostream& operator<<(ostream& os, U u) {
rep(i,0,M_BITS) os << (getbit(u, i) ? '1' : '0');
return os;
}
template <class F>
void generic_for_each(F&& f) {}
template <class F, class Arg, class... Args>
void generic_for_each(F&& f, Arg& arg, Args&... args) {
f(arg);
generic_for_each(f, args...);
}
struct State {
int a, b;
int evs[8];
int evi;
int pos;
int tp;
M prev, acc;
M affected, naffected;
M ones, twos, fours, eights;
};
struct Temp {
// Persistent state from State
M prev, acc, affected, naffected;
M ones, twos, fours, eights;
// Temporary state
M cur, next, local;
Temp() {}
Temp(State& s) :
prev(s.prev), acc(s.acc),
affected(s.affected), naffected(s.naffected),
ones(s.ones), twos(s.twos), fours(s.fours), eights(s.eights)
{}
void writeBack(State& s) {
s.prev = prev;
s.acc = acc;
s.affected = affected;
s.naffected = naffected;
s.ones = ones;
s.twos = twos;
s.fours = fours;
s.eights = eights;
}
};
struct Op {
enum { IS_POPCNT = 0, NEED_NEXT = 0 };
};
struct Op1 : Op {
void handle(Temp& t) {
t.cur &= t.naffected;
}
};
struct Op2 : Op {
void handle(Temp& t) {
t.cur |= t.affected;
}
};
struct Op3 : Op {
enum { NEED_NEXT = 1 };
void handle(Temp& t) {
t.cur |= t.next & t.affected;
}
};
struct Op4 : Op {
void handle(Temp& t) {
M old = t.cur;
t.cur |= t.prev & t.affected;
t.prev = old;
}
};
struct Op5 : Op {
enum { NEED_NEXT = 1 };
void handle(Temp& t) {
t.cur &= t.next | t.naffected;
}
};
struct Op6 : Op {
void handle(Temp& t) {
M old = t.cur;
t.cur &= t.prev | t.naffected;
t.prev = old;
}
};
// Harley-Seal popcount from http://0x80.pl/articles/sse-popcount.html
void CSA(M& h, M& a, M b, M c) {
#ifdef USE_AVX256
const M u = a ^ b;
h = (a & b) | (u & c);
a = u ^ c;
#else
M a2 = a;
a = _mm512_ternarylogic_epi32(c, b, a2, 0x96);
h = _mm512_ternarylogic_epi32(c, b, a2, 0xe8);
#endif
}
M popcount(const M v) {
const M m1 = MM(set1_epi8)(0x55);
const M m2 = MM(set1_epi8)(0x33);
const M m4 = MM(set1_epi8)(0x0F);
const M t1 = MM(sub_epi8)(v, (MM(srli_epi16)(v, 1) & m1));
const M t2 = MM(add_epi8)(t1 & m2, (MM(srli_epi16)(t1, 2) & m2));
const M t3 = MM(add_epi8)(t2, MM(srli_epi16)(t2, 4)) & m4;
return MM(sad_epu8)(t3, ZERO);
}
struct Op7 : Op {
enum { IS_POPCNT = 1 };
void handle16(Temp& t, U* data) {
M twosA, twosB, foursA, foursB, eightsA, eightsB, sixteens;
CSA(twosA, t.ones, data[0].m & t.affected, data[1].m & t.affected);
CSA(twosB, t.ones, data[2].m & t.affected, data[3].m & t.affected);
CSA(foursA, t.twos, twosA, twosB);
CSA(twosA, t.ones, data[4].m & t.affected, data[5].m & t.affected);
CSA(twosB, t.ones, data[6].m & t.affected, data[7].m & t.affected);
CSA(foursB, t.twos, twosA, twosB);
CSA(eightsA, t.fours, foursA, foursB);
CSA(twosA, t.ones, data[8].m & t.affected, data[9].m & t.affected);
CSA(twosB, t.ones, data[10].m & t.affected, data[11].m & t.affected);
CSA(foursA, t.twos, twosA, twosB);
CSA(twosA, t.ones, data[12].m & t.affected, data[13].m & t.affected);
CSA(twosB, t.ones, data[14].m & t.affected, data[15].m & t.affected);
CSA(foursB, t.twos, twosA, twosB);
CSA(eightsB, t.fours, foursA, foursB);
CSA(sixteens, t.eights, eightsA, eightsB);
t.acc = MM(add_epi64)(t.acc,
MM(slli_epi64)(popcount(sixteens), 4));
}
// pshufb-based popcnt, see http://0x80.pl/articles/sse-popcount.html
void handle(Temp& t) {
const M lowMask = MM(set1_epi8)(0xf);
const M popcntLookup = MM_SHUFFLE_MASK(
0x04030302,
0x03020201,
0x03020201,
0x02010100
);
const M vec = t.cur & t.affected;
const M lo = vec & lowMask;
const M hi = MM(srli_epi16)(vec, 4) & lowMask;
const M popcnt1 = MM(shuffle_epi8)(popcntLookup, lo);
const M popcnt2 = MM(shuffle_epi8)(popcntLookup, hi);
t.local = MM(add_epi8)(t.local, popcnt1);
t.local = MM(add_epi8)(t.local, popcnt2);
}
void handle8th(Temp& t) {
t.acc = MM(add_epi64)(t.acc, MM(sad_epu8)(t.local, ZERO));
}
};
template<class F>
void withOp(int tp, F&& f) {
switch (tp) {
case 1: f(Op1()); break;
case 2: f(Op2()); break;
case 3: f(Op3()); break;
case 4: f(Op4()); break;
case 5: f(Op5()); break;
case 6: f(Op6()); break;
case 7: f(Op7()); break;
default: assert(0);
}
}
constexpr int pad = 2;
constexpr int PREFIX_PAD = 8;
State states[pad];
U prefixes[M_BITS+1 + PREFIX_PAD*2];
U* blocks;
int nblocks;
int nops;
void computeAffected(State& s) {
int at = s.pos;
if (at < s.evs[s.evi])
return;
while (at >= s.evs[s.evi])
s.evi++;
// Compute mask for which of the entries corresponding to bits
// 0..63 should be updated when iterating over the range
// [s.evs[s.evi-1], s.evs[s.evi]). Because of how "evs" was
// constructed, this will be constant over that range (if we
// interpret infinities correctly) and we can sample a random
// point "at" in the interval for computing it.
//
// The mask should have bit i set if:
// a <= i * nblocks + at < b
// (a - at) / nblocks <= i < (b - at) / nblocks
// ceil((a - at) / nblocks) <= i < ceil((b - at) / nblocks)
//
// We assume this range is contained in [-PREFIX_PAD, M_BITS + PREFIX_PAD].
auto ceildiv = [](int a) {
// a can be negative, causing a badly rounded division, but not by very
// much. Compensate by adding PREFIX_PAD*b.
return (a + PREFIX_PAD*nblocks + nblocks-1) / nblocks - PREFIX_PAD;
};
int low = ceildiv(s.a - at);
int high = ceildiv(s.b - at);
assert(low <= high);
assert(-PREFIX_PAD <= low);
assert(high <= M_BITS + PREFIX_PAD);
s.affected = MM_ANDNOT(prefixes[PREFIX_PAD + low].m, prefixes[PREFIX_PAD + high].m);
s.naffected = MM_ANDNOT(s.affected, ONE);
}
void advanceOne(State& s, int goal) {
int at = s.pos;
assert(at < goal);
computeAffected(s);
int from = at, to = min(goal, s.evs[s.evi]);
assert(from < to);
Temp t(s);
withOp(s.tp, [&](auto&& op) {
rep(i,from,to) {
t.local = ZERO;
t.cur = blocks[i].m;
if constexpr (op.NEED_NEXT) t.next = blocks[i+1].m;
op.handle(t);
if constexpr (op.IS_POPCNT) op.handle8th(t);
else blocks[i].m = t.cur;
}
});
t.writeBack(s);
s.pos = to;
}
void advance(State& s, int goal) {
while (s.pos != goal)
advanceOne(s, goal);
}
template<class... Ops>
void runLockstepLoop(int start, int goal, Ops&... ops) {
constexpr int nops = (int) sizeof...(ops);
assert(start < goal);
Temp temps[nops];
rep(i,0,nops) temps[i] = Temp(states[i]);
rep(i,0,nops) assert(states[i].pos == start + (nops-1 - i));
const int CHUNK = 16;
int i = start;
for (; i + CHUNK <= goal; i += CHUNK) {
int ind = 0;
generic_for_each([&](auto&& op) {
Temp& t = temps[ind];
if constexpr (op.IS_POPCNT) {
op.handle16(t, &blocks[i + (nops-1 - ind)]);
} else {
rep(it,0,CHUNK) {
int j = i + it + (nops-1 - ind);
t.cur = blocks[j].m;
if constexpr (op.NEED_NEXT) t.next = blocks[j+1].m;
op.handle(t);
blocks[j].m = t.cur;
}
}
ind++;
}, ops...);
}
rep(j,0,nops) temps[j].local = ZERO;
for (; i < goal; ++i) {
int ind = 0;
generic_for_each([&](auto&& op) {
int j = i + (nops-1 - ind);
Temp& t = temps[ind];
t.cur = blocks[j].m;
if constexpr (op.NEED_NEXT) t.next = blocks[j+1].m;
op.handle(t);
if constexpr (!op.IS_POPCNT) blocks[j].m = t.cur;
ind++;
}, ops...);
}
{
int ind = 0;
generic_for_each([&](auto&& op) {
if constexpr (op.IS_POPCNT) op.handle8th(temps[ind]);
ind++;
}, ops...);
}
rep(i,0,nops) temps[i].writeBack(states[i]);
rep(i,0,nops) states[i].pos = goal + (nops-1 - i);
}
template<class... Ops>
void loopLockstep(int goal, Ops&... ops) {
constexpr int nops = sizeof...(ops);
if constexpr (nops == 0) return;
int pos = states[nops-1].pos;
while (pos < goal) {
int to = goal;
rep(i,0,nops) {
computeAffected(states[i]);
to = min(to, states[i].evs[states[i].evi] - (nops-1 - i));
}
runLockstepLoop(pos, to, ops...);
pos = to;
}
}
template<class... Ops>
void recLockstep(int goal, Ops&&... ops) {
constexpr int i = sizeof...(ops);
if constexpr (i > pad) {
assert(0);
} else if (i == nops) {
loopLockstep(goal, ops...);
} else if constexpr (i < pad) {
withOp(states[i].tp, [&](auto&& op) {
recLockstep(goal, ops..., op);
});
} else {
assert(0);
}
}
int main() {
// Given a bit array A of size N and Q operations of the types:
// 1. for every i in [a, b), set A[i] = 0
// 2. for every i in [a, b), set A[i] = 1
// 3. for every i in [a, b) set A[i] = A[i] | A[i+1]
// 4. for every i in [a, b) set A[i] = A[i] | A[i-1]
// 5. for every i in [a, b) set A[i] = A[i] & A[i+1]
// 6. for every i in [a, b) set A[i] = A[i] & A[i-1]
// 7. compute popcount of [a, b)
// Computes the array A after all operations have been performed.
int N, Q;
gi(N), gi(Q);
nblocks = max((N + M_BITS-1) / M_BITS, pad * 2 + 5);
prefixes[0+PREFIX_PAD].m = ZERO;
rep(i,0,M_BITS) {
prefixes[i+1+PREFIX_PAD] = prefixes[i+PREFIX_PAD];
setbit(prefixes[i+1+PREFIX_PAD], i);
}
rep(i,0,PREFIX_PAD) prefixes[i].m = ZERO;
rep(i,0,PREFIX_PAD) prefixes[PREFIX_PAD+M_BITS+1 + i].m = ONE;
// Order the bits so that block i -- a 256-bit AVX vector -- consists of
// bits nblocks*j + i for j = 0..255. With that representation, shifting
// left/right by one bit becomes shifting left/right by one block, except
// at the edges. Picture: (with W = nblocks)
//
// 0 | 1 | 2 | 3 | 4 | ... | W-1
// W | W+1 | W+2 | W+3 | W+4 | | 2*W-1
// ...
// 255*W | ... | | N-1 | 0 | ... | 0
// blocks[0] | blocks[1] | blocks[2] | | | ... | blocks[W-1]
blocks = (U*)_mm_malloc((nblocks + 2 * pad) * sizeof(U), sizeof(U)) + pad;
rep(i,0,N) {
int k; gi(k);
int bit = i / nblocks, block = i % nblocks;
assert(bit < M_BITS);
if (k) setbit(blocks[block], bit);
}
const M one = ONE;
// Q = min(Q, 500'000);
int qi = 0;
while (qi < Q) {
nops = min(Q - qi, pad);
qi += nops;
// Explicitly compute the edge wrap-around and store it as padding
// around the actual blocks.
rep(i,0,pad) {
blocks[i - pad] = sll1(blocks[nblocks + i - pad]);
blocks[nblocks + i] = srl1(blocks[i]);
}
rep(i,0,nops) {
int tp, a, b;
gi(tp), gi(a), gi(b);
--a;
if (tp == 3 || tp == 5) b--;
if (tp == 4 || tp == 6) a++;
assert(0 <= a && a <= b && b <= N);
if (tp == 3 || tp == 5) assert(b < N);
if (tp == 4 || tp == 6) assert(a > 0);
// tp = 7;
State& s = states[i];
s.tp = tp;
s.a = a;
s.b = b;
s.prev = ZERO;
s.acc = ZERO;
s.affected = ZERO;
s.naffected = one;
s.ones = ZERO;
s.twos = ZERO;
s.fours = ZERO;
s.eights = ZERO;
// One too much, just to compute "prev". It only results in unused
// padding being incorrect.
s.pos = tp == 7 ? 0 : -nops + i;
s.evi = 0;
int* evs = s.evs;
evs[0] = INT_MIN;
evs[1] = 0;
evs[2] = a % nblocks;
evs[3] = b % nblocks;
evs[4] = nblocks;
int nevs = 5;
for (int x : {evs[2], evs[3]}) for (int delta : {-nblocks, nblocks}) {
int y = x + delta;
if (-pad <= y && y <= nblocks + pad) evs[nevs++] = y;
}
assert(nevs <= 7);
evs[nevs++] = INT_MAX;
sort(evs + 1, evs + nevs - 1);
}
// Initial part
rep(i,0,nops) {
advance(states[i], nops-1 - i);
}
// Middle part, in lock-step, until where states with tp == 7 need to stop
int goal = nblocks - nops;
recLockstep(goal);
// Final part
rep(i,0,nops) {
int to = nblocks + (states[i].tp == 7 ? 0 : nops-1 - i);
advance(states[i], to);
}
rep(i,0,nops) if (states[i].tp == 7) {
State& s = states[i];
M acc = s.acc;
acc = MM(add_epi64)(acc, MM(slli_epi64)(popcount(s.eights), 3));
acc = MM(add_epi64)(acc, MM(slli_epi64)(popcount(s.fours), 2));
acc = MM(add_epi64)(acc, MM(slli_epi64)(popcount(s.twos), 1));
acc = MM(add_epi64)(acc, popcount(s.ones));
ll sum = 0;
U u;
u.m = acc;
rep(j,0,M_WORDS) sum += u.words[j];
printf("%d\n", (int)sum);
}
}
#ifdef PRINT_FINAL
rep(i,0,N) {
int bit = i / nblocks, block = i % nblocks;
assert(bit < M_BITS);
putchar(getbit(blocks[block], bit) ? '1' : '0');
}
putchar('\n');
#endif
fflush(stdout);
return 0;
}
}
namespace Lin {
using M=__m512i;
union U { M m; uint64_t words[8]; };
inline void setbit(U& u, const unsigned& bit) { u.words[bit>>6] |= 1ull<<(bit&63);}
void RR(U& u) {
u.words[0] = (u.words[0]>>1) | (u.words[1]<<63);
u.words[1] = (u.words[1]>>1) | (u.words[2]<<63);
u.words[2] = (u.words[2]>>1) | (u.words[3]<<63);
u.words[3] = (u.words[3]>>1) | (u.words[4]<<63);
u.words[4] = (u.words[4]>>1) | (u.words[5]<<63);
u.words[5] = (u.words[5]>>1) | (u.words[6]<<63);
u.words[6] = (u.words[6]>>1) | (u.words[7]<<63);
u.words[7] = u.words[7]>>1;
}
void LL(U& u) {
u.words[7] = (u.words[7]<<1) | (u.words[6]>>63);
u.words[6] = (u.words[6]<<1) | (u.words[5]>>63);
u.words[5] = (u.words[5]<<1) | (u.words[4]>>63);
u.words[4] = (u.words[4]<<1) | (u.words[3]>>63);
u.words[3] = (u.words[3]<<1) | (u.words[2]>>63);
u.words[2] = (u.words[2]<<1) | (u.words[1]>>63);
u.words[1] = (u.words[1]<<1) | (u.words[0]>>63);
u.words[0] = u.words[0]<<1;
}
int opt, l, r, sum;
U* blocks;
U prefixes[513];
int nblocks;
namespace SimdSum {
struct sse_vector final {
__m128i v;
sse_vector() = delete;
sse_vector(sse_vector&) = delete;
explicit sse_vector(const __m128i& vec): v(vec) {}
};
__m128i operator&(sse_vector a, sse_vector b) { return _mm_and_si128(a.v, b.v); }
__m128i operator|(sse_vector a, sse_vector b) { return _mm_or_si128(a.v, b.v); }
__m128i operator^(sse_vector a, sse_vector b) { return _mm_xor_si128(a.v, b.v); }
struct shift16 final { const unsigned bits; shift16() = delete; explicit shift16(unsigned bits) : bits(bits) {}; };
__m128i operator>>(const __m128i a, const shift16 amount) { return _mm_srli_epi16(a, amount.bits); }
uint64_t lower_qword(const __m128i v) { return _mm_cvtsi128_si64(v); }
uint64_t higher_qword(const __m128i v) { return lower_qword(_mm_srli_si128(v, 8)); }
uint64_t simd_sum_epu64(const __m128i v) { return lower_qword(v) + higher_qword(v); }
uint64_t simd_sum_epu64(const __m256i v) { return static_cast<uint64_t>(_mm256_extract_epi64(v, 0)) + static_cast<uint64_t>(_mm256_extract_epi64(v, 1)) + static_cast<uint64_t>(_mm256_extract_epi64(v, 2)) + static_cast<uint64_t>(_mm256_extract_epi64(v, 3)); }
uint64_t simd_sum_epu64(const __m512i v) {
const __m256i lo = _mm512_extracti64x4_epi64(v, 0);
const __m256i hi = _mm512_extracti64x4_epi64(v, 1);
return simd_sum_epu64(lo) + simd_sum_epu64(hi);
}
};
namespace AVX512_harley_seal {
__m512i popcount(const __m512i v) {
const __m512i m1 = _mm512_set1_epi8(0x55);
const __m512i m2 = _mm512_set1_epi8(0x33);
const __m512i m4 = _mm512_set1_epi8(0x0F);
const __m512i t1 = _mm512_sub_epi8(v, (_mm512_srli_epi16(v, 1) & m1));
const __m512i t2 = _mm512_add_epi8(t1 & m2, (_mm512_srli_epi16(t1, 2) & m2));
const __m512i t3 = _mm512_add_epi8(t2, _mm512_srli_epi16(t2, 4)) & m4;
return _mm512_sad_epu8(t3, _mm512_setzero_si512());
}
void CSA(__m512i& h, __m512i& l, __m512i a, __m512i b, __m512i c) {
l = _mm512_ternarylogic_epi32(c, b, a, 0x96);
h = _mm512_ternarylogic_epi32(c, b, a, 0xe8);
}
uint64_t popcnt(const U* data, __m512i mask, const uint64_t size) {
__m512i total = _mm512_setzero_si512();
__m512i ones = _mm512_setzero_si512();
__m512i twos = _mm512_setzero_si512();
__m512i fours = _mm512_setzero_si512();
__m512i eights = _mm512_setzero_si512();
__m512i sixteens = _mm512_setzero_si512();
__m512i twosA, twosB, foursA, foursB, eightsA, eightsB;
const uint64_t limit = size - size % 16;
uint64_t i = 0;
for(; i < limit; i += 16) {
CSA(twosA, ones, ones, data[i+0].m & mask, data[i+1].m & mask);
CSA(twosB, ones, ones, data[i+2].m & mask, data[i+3].m & mask);
CSA(foursA, twos, twos, twosA, twosB);
CSA(twosA, ones, ones, data[i+4].m & mask, data[i+5].m & mask);
CSA(twosB, ones, ones, data[i+6].m & mask, data[i+7].m & mask);
CSA(foursB, twos, twos, twosA, twosB);
CSA(eightsA,fours, fours, foursA, foursB);
CSA(twosA, ones, ones, data[i+8].m & mask, data[i+9].m & mask);
CSA(twosB, ones, ones, data[i+10].m & mask, data[i+11].m & mask);
CSA(foursA, twos, twos, twosA, twosB);
CSA(twosA, ones, ones, data[i+12].m & mask, data[i+13].m & mask);
CSA(twosB, ones, ones, data[i+14].m & mask, data[i+15].m & mask);
CSA(foursB, twos, twos, twosA, twosB);
CSA(eightsB, fours, fours, foursA, foursB);
CSA(sixteens, eights, eights, eightsA, eightsB);
total = _mm512_add_epi64(total, popcount(sixteens));
}
total = _mm512_slli_epi64(total, 4); // * 16
total = _mm512_add_epi64(total, _mm512_slli_epi64(popcount(eights), 3)); // += 8 * ...
total = _mm512_add_epi64(total, _mm512_slli_epi64(popcount(fours), 2)); // += 4 * ...
total = _mm512_add_epi64(total, _mm512_slli_epi64(popcount(twos), 1)); // += 2 * ...
total = _mm512_add_epi64(total, popcount(ones));
for(; i < size; i++)
total = _mm512_add_epi64(total, popcount(data[i].m & mask));
return SimdSum::simd_sum_epu64(total);
}
} // AVX512_harley_seal
void process(int L, int R) {
M u;
int low = (l-L-1+(nblocks<<4))/nblocks-15;
int high = (r-L-1+(nblocks<<4))/nblocks-16;
if(opt == 1 || opt == 5 || opt == 6) u = (high < 0) ? _mm512_set1_epi32(-1) : _mm512_andnot_si512(_mm512_andnot_si512(prefixes[low].m, prefixes[high+1].m), _mm512_set1_epi32(-1));
else u = (high < 0) ? _mm512_setzero_si512() : _mm512_andnot_si512(prefixes[low].m, prefixes[high+1].m);
L++; R++;
if(opt == 1) iter(i, L, R) blocks[i].m &= u;
else if(opt == 2) iter(i, L, R) blocks[i].m |= u;
else if(opt == 3) iter(i, L, R) blocks[i].m |= blocks[i+1].m & u;
else if(opt == 4) reti(i, L, R) blocks[i].m |= blocks[i-1].m & u;
else if(opt == 5) iter(i, L, R) blocks[i].m &= blocks[i+1].m | u;
else if(opt == 6) reti(i, L, R) blocks[i].m &= blocks[i-1].m | u;
else sum += AVX512_harley_seal::popcnt(blocks+L, u, R-L);
}
signed main() {
ios_base::sync_with_stdio(false); cin.tie(0);
int N, Q; gi(N), gi(Q);
nblocks = max((N+511)/512, 1);
prefixes[0].m = prefixes[1].m = _mm512_setzero_si512();
prefixes[1].words[0] |= 1;
iter(i, 2, 513) {
prefixes[i].m = prefixes[i-1].m;
prefixes[i].words[(i-1)>>6] |= 1ull<<((i-1)&63);
}
blocks = (U*)_mm_malloc((nblocks+2)*(sizeof(U)), sizeof(U));
iter(i, 0, N) {
int k; gi(k);
if(k) setbit(blocks[i % nblocks+1], i / nblocks);
}
while(Q--) {
gi(opt), gi(l), gi(r);
l -= (opt != 4 && opt != 6);
r -= (opt == 3 || opt == 5);
if(opt == 4 || opt == 6) blocks[0] = blocks[nblocks], LL(blocks[0]);
else if(opt == 3 || opt == 5) blocks[nblocks+1] = blocks[1], RR(blocks[nblocks+1]);
int v2 = l%nblocks, v3 = r%nblocks;
if(v2 > v3) swap(v2, v3);
sum = 0;
if(opt == 4 || opt == 6) process(v3, nblocks), process(v2, v3), process(0, v2);
else process(0, v2), process(v2, v3), process(v3, nblocks);
if(opt == 7) print(sum), pc('\n');
}
return 0;
}
}
int main() {
int N, Q; gi(N), gi(Q);
io::iT++;
int tmp;
iter(i, 0, N) gi(tmp);
int c7 = 0;
iter(i, 0, Q) { int a, b, c; gi(a), gi(b), gi(c); c7 += (a == 7); }
io::reverse();
if(c7 > N/2) Lin::main();
else Par::main();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment