Skip to content

Instantly share code, notes, and snippets.

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
inline size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) {
const uint8_t*& ip = *ip_p;
// This section is crucial for the throughput of the decompression loop.
// The latency of an iteration is fundamentally constrained by the
// following data chain on ip.
// ip -> c = Load(ip) -> ip1 = ip + 1 + (c & 3) -> ip = ip1 or ip2
// ip2 = ip + 2 + (c >> 2)
// This amounts to 8 cycles.
// 5 (load) + 1 (c & 3) + 1 (lea ip1, [ip + (c & 3) + 1]) + 1 (cmov)
@danlark1
danlark1 / perf.data
Last active May 29, 2021 20:22
perf_data_xeon
Performance counter stats for '/usr/local/home/danilak/zstd_clean/build/cmake/build_clean_gcc_8/programs/zstd -t /usr/local/home/danilak/benchmark_zstd/arc.0.tar.zst' (20 runs):
207.16 msec task-clock # 0.996 CPUs utilized ( +- 0.25% )
<not supported> cycles
Performance counter stats for '/usr/local/home/danilak/zstd/build/cmake/build_optimized_gcc_8/programs/zstd -t /usr/local/home/danilak/benchmark_zstd/arc.0.tar.zst' (20 runs):
209.24 msec task-clock # 0.981 CPUs utilized ( +- 0.25% )
<not supported> cycles
Performance counter stats for '/usr/local/home/danilak/zstd_clean/build/cmake/build_clean_gcc_9/programs/zstd -t /usr/local/home/danilak/benchmark_zstd/arc.0.tar.zst' (20 runs):
207.00 msec task-clock # 0.996 CPUs utilized ( +- 0.35% )
<not supported>
@danlark1
danlark1 / data.perf
Created May 29, 2021 19:38
perf_data_laptop
Performance counter stats for '/home/danilak/zstd_clean/build/cmake/build_clean_gcc_8/programs/zstd -t /home/danilak/benchmark_zstd/data/arc.0.tar.zst' (20 runs):
150.15 msec task-clock # 0.997 CPUs utilized ( +- 0.24% )
627,747,357 cycles # 4.181 GHz ( +- 0.09% )
Performance counter stats for '/home/danilak/zstd/build/cmake/build_optimized_gcc_8/programs/zstd -t /home/danilak/benchmark_zstd/data/arc.0.tar.zst' (20 runs):
149.15 msec task-clock # 0.997 CPUs utilized ( +- 0.03% )
624,933,425 cycles # 4.190 GHz ( +- 0.03% )
Performance counter stats for '/home/danilak/zstd_clean/build/cmake/build_clean_gcc_9/programs/zstd -t /home/danilak/benchmark_zstd/data/arc.0.tar.zst' (20 runs):
148.60 msec task-clock # 0.998 CPUs utilized ( +- 0.08% )
622,635,362 cycles
silesia -1 Ratio LL: 15.4618
silesia -1 Ratio ML: 17.7421
silesia -1 Ratio OF: 3.23671
silesia -1 Ratio LLD: 6.98345
silesia 0 Ratio LL: 40.1268
silesia 0 Ratio ML: 21.6195
silesia 0 Ratio OF: 5.39841
silesia 0 Ratio LLD: 4.05028
silesia 1 Ratio LL: 11.7427
silesia 1 Ratio ML: 15.1667
@danlark1
danlark1 / clang.obj
Created May 29, 2021 17:51
clang_objdump
zstd_clang.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <ZSTD_getcBlockSize>:
/*! ZSTD_getcBlockSize() :
* Provides the size of compressed block from block header `src` */
@danlark1
danlark1 / gcc.obj
Created May 29, 2021 17:49
gcc_objdump
zstd_gcc.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <ZSTD_safecopy>:
* @param ovtype controls the overlap detection
* - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
* - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
struct Median3Killer {
static std::vector<uint32_t> Gen(size_t size) {
size_t k = size / 2;
std::vector<uint32_t> v;
v.reserve(size);
for (size_t i = 1; i < k + 1; ++i) {
if (i & 1) {
v.push_back(i);
} else {
v.push_back(k + i - 1);
// Implicitly "inline"d
constexpr int foo() {
//...
}
// Implicitly "inline"d
template <class T>
int bar() {
//...
}
// inline.h
// With inline everything compiles, without it is a linkage error failure
/* inline */ int f(int x) {
return x + 42;
}
// TU1.cpp
#include "inline.h"
syntax = "proto3";
package tgnews_index;
message Doc {
bytes title = 1;
bytes url = 2;
string lang = 3;
string category = 4;
repeated float embed = 5;