Skip to content

Instantly share code, notes, and snippets.

// Installation.
// Abseil, compile with -labseil
// Libdivide as header.
// GMP as header, compile with -lgmp
// Change the private constructor of absl::uin128 to public in order to use public inheritance.
// GMOCK, GUNIT, compile it.
#include "absl/numeric/int128.h"
#include <random>
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
inline size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) {
const uint8_t*& ip = *ip_p;
// This section is crucial for the throughput of the decompression loop.
// The latency of an iteration is fundamentally constrained by the
// following data chain on ip.
// ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
// ip2 = ip + 2 + (c >> 2)
// This amounts to 8 cycles.
// 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
#include <cinttypes>
#include <atomic>
#include <memory>
#include <type_traits>
//start 00:20
// genius idea https://en.wikipedia.org/wiki/Tagged_pointer
template <class T>
#include "int_lib.h"
/*
typedef int si_int;
typedef unsigned su_int;
typedef long long di_int;
typedef unsigned long long du_int;
typedef int ti_int __attribute__((mode(TI))); // 128 signed
@danlark1
danlark1 / perf.data
Last active May 29, 2021 20:22
perf_data_xeon
Performance counter stats for '/usr/local/home/danilak/zstd_clean/build/cmake/build_clean_gcc_8/programs/zstd -t /usr/local/home/danilak/benchmark_zstd/arc.0.tar.zst' (20 runs):
207.16 msec task-clock # 0.996 CPUs utilized ( +- 0.25% )
<not supported> cycles
Performance counter stats for '/usr/local/home/danilak/zstd/build/cmake/build_optimized_gcc_8/programs/zstd -t /usr/local/home/danilak/benchmark_zstd/arc.0.tar.zst' (20 runs):
209.24 msec task-clock # 0.981 CPUs utilized ( +- 0.25% )
<not supported> cycles
Performance counter stats for '/usr/local/home/danilak/zstd_clean/build/cmake/build_clean_gcc_9/programs/zstd -t /usr/local/home/danilak/benchmark_zstd/arc.0.tar.zst' (20 runs):
207.00 msec task-clock # 0.996 CPUs utilized ( +- 0.35% )
<not supported>
@danlark1
danlark1 / data.perf
Created May 29, 2021 19:38
perf_data_laptop
Performance counter stats for '/home/danilak/zstd_clean/build/cmake/build_clean_gcc_8/programs/zstd -t /home/danilak/benchmark_zstd/data/arc.0.tar.zst' (20 runs):
150.15 msec task-clock # 0.997 CPUs utilized ( +- 0.24% )
627,747,357 cycles # 4.181 GHz ( +- 0.09% )
Performance counter stats for '/home/danilak/zstd/build/cmake/build_optimized_gcc_8/programs/zstd -t /home/danilak/benchmark_zstd/data/arc.0.tar.zst' (20 runs):
149.15 msec task-clock # 0.997 CPUs utilized ( +- 0.03% )
624,933,425 cycles # 4.190 GHz ( +- 0.03% )
Performance counter stats for '/home/danilak/zstd_clean/build/cmake/build_clean_gcc_9/programs/zstd -t /home/danilak/benchmark_zstd/data/arc.0.tar.zst' (20 runs):
148.60 msec task-clock # 0.998 CPUs utilized ( +- 0.08% )
622,635,362 cycles
silesia -1 Ratio LL: 15.4618
silesia -1 Ratio ML: 17.7421
silesia -1 Ratio OF: 3.23671
silesia -1 Ratio LLD: 6.98345
silesia 0 Ratio LL: 40.1268
silesia 0 Ratio ML: 21.6195
silesia 0 Ratio OF: 5.39841
silesia 0 Ratio LLD: 4.05028
silesia 1 Ratio LL: 11.7427
silesia 1 Ratio ML: 15.1667
@danlark1
danlark1 / clang.obj
Created May 29, 2021 17:51
clang_objdump
zstd_clang.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <ZSTD_getcBlockSize>:
/*! ZSTD_getcBlockSize() :
* Provides the size of compressed block from block header `src` */
@danlark1
danlark1 / gcc.obj
Created May 29, 2021 17:49
gcc_objdump
zstd_gcc.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <ZSTD_safecopy>:
* @param ovtype controls the overlap detection
* - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
* - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
struct Median3Killer {
static std::vector<uint32_t> Gen(size_t size) {
size_t k = size / 2;
std::vector<uint32_t> v;
v.reserve(size);
for (size_t i = 1; i < k + 1; ++i) {
if (i & 1) {
v.push_back(i);
} else {
v.push_back(k + i - 1);