var totalSize = sizeof(PForHeader); int i = 0; var entriesAsInt = (uint*)_entriesOutput; var entriesIn = _entries; var prev = Vector256.Create(*entriesIn); var max = Vector256.Create<long>(uint.MaxValue); for (; i + 256 <= _count; i += 256) { var blockStart = entriesAsInt; int j = 0; for (; j < 256; j += Vector256<long>.Count) { var cur = Vector256.Load(entriesIn + i + j); var mixed = Vector256.Shuffle(cur, Vector256.Create(0, 0, 1, 2)) & Vector256.Create(0, -1, -1, -1) | Vector256.Shuffle(prev, Vector256.Create(3, 3, 3, 3)) & Vector256.Create(-1, 0, 0, 0); prev = cur; var delta = cur - mixed; if (Vector256.GreaterThanAny(delta, max)) { HandleDeltaGreaterThanMax(j, delta); } var deltaInts = Vector256.Shuffle(delta.AsUInt32(), Vector256.Create(0u,2,4,6,0,0,0,0)); deltaInts.Store(entriesAsInt); // we write 8 values, but increment by 4, so we'll overwrite it next op entriesAsInt += Vector256<long>.Count; } totalSize += ProcessBlock(blockStart); }