Skip to content

Instantly share code, notes, and snippets.

View cyb70289's full-sized avatar

Yibo Cai cyb70289

View GitHub Profile
[root@lnd-jm02 iscsi-hardlock]# ./test-lse
Counters: 17171 48463 25182 21011 24232 24657 20559 15806 22230 42319 32158 16342 33570 29741 26467 19637 24702 26923
Counters: 32542 38140 21447 21856 25708 21327 13062 23763 24566 35316 36268 24727 28855 23722 34471 28493 17219 13948
Counters: 39521 29413 29365 26218 24139 10212 34962 16706 6899 30642 54875 30239 29795 22390 13964 29797 14388 21804
Counters: 39066 24223 10731 19933 41360 19116 17735 38524 24549 34395 22350 21955 28910 27429 33503 20678 24180 16701
Counters: 45116 30269 18649 32161 14482 25578 34041 29304 38479 20491 30742 29352 26589 17246 22371 23332 9264 18094
Counters: 35308 41495 16516 28869 27017 19990 22366 28377 18060 13942 29316 25233
@cyb70289
cyb70289 / gist:09438df3f78dafd6f5de7c4876daafe7
Created August 27, 2024 04:29
0002-optimize-varint-with-lookup-table.patch
From 2eee837216e575abd9e48a30c65161a42ad59117 Mon Sep 17 00:00:00 2001
From: Yibo Cai <yibo.cai@arm.com>
Date: Wed, 21 Aug 2024 06:40:48 -0400
Subject: [PATCH 2/2] optimize varint with lookup table
---
src/google/protobuf/io/coded_stream.h | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/src/google/protobuf/io/coded_stream.h b/src/google/protobuf/io/coded_stream.h
@cyb70289
cyb70289 / gist:d947c64cf0f116607140e06b535b95aa
Created August 27, 2024 03:52
0001-add-string-list-and-map-benchmarks.patch
From a5239aaf894334f4b4c331f1d40245f5f64a2cd4 Mon Sep 17 00:00:00 2001
From: Yibo Cai <yibo.cai@arm.com>
Date: Wed, 21 Aug 2024 06:40:29 -0400
Subject: [PATCH 1/2] add string list and map benchmarks
---
benchmarks/benchmark.cc | 65 +++++++++++++++++++++++++++++++++++++
benchmarks/descriptor.proto | 8 +++++
2 files changed, 73 insertions(+)
@cyb70289
cyb70289 / optimize-decode.diff
Created July 18, 2024 03:20
sonic-decode-opt
diff --git a/cmake/set_arch_flags.cmake b/cmake/set_arch_flags.cmake
index 538ddfe..81c40e4 100644
--- a/cmake/set_arch_flags.cmake
+++ b/cmake/set_arch_flags.cmake
@@ -2,8 +2,8 @@ function(set_arch_flags target arch)
message(STATUS "Setting architecture flags for ${arch}")
if(arch MATCHES "x86_64")
target_compile_options(${target} PRIVATE -mavx2 -mpclmul -mbmi -mlzcnt)
- elseif(arch MATCHES "arm")
- target_compile_options(${target} PRIVATE -march=armv8-a)
@cyb70289
cyb70289 / optimize-skipstring.diff
Created July 4, 2024 03:24
optimize skipstring with sve2 match
diff --git a/cmake/set_arch_flags.cmake b/cmake/set_arch_flags.cmake
index 538ddfe..6dc7754 100644
--- a/cmake/set_arch_flags.cmake
+++ b/cmake/set_arch_flags.cmake
@@ -2,8 +2,8 @@ function(set_arch_flags target arch)
message(STATUS "Setting architecture flags for ${arch}")
if(arch MATCHES "x86_64")
target_compile_options(${target} PRIVATE -mavx2 -mpclmul -mbmi -mlzcnt)
- elseif(arch MATCHES "arm")
- target_compile_options(${target} PRIVATE -march=armv8-a)
diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
index 5c7c6d5..dc55c69 100644
--- a/src/cpu/aarch64/jit_uni_reorder.cpp
+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
@@ -2680,6 +2680,55 @@ status_t jit_uni_reorder_t::pd_t::create(reorder_pd_t **reorder_pd,
return safe_ptr_assign(*reorder_pd, _pd.release());
}
+#define MY_REORDER
+
##################################################################
# profile.py
##################################################################
import tensorflow as tf
import timeit
import os
n_threads = int(os.getenv('OMP_NUM_THREADS'))
if n_threads < 1 or n_threads > 999:
@cyb70289
cyb70289 / gist:98fab345783f23dde9ddce000aaa0a12
Created June 17, 2024 07:04
bluewhale latency vs. bandwidth
Bluewhale memory latency vs. bandwidth
======================================
max bw = 574*8*32 MB/s = 147GB/s
bw = 0%
-------
$ numactl -m0 -N0 /usr/lib/lmbench/bin/lat_mem_rd -P 1 512 4096
"stride=4096
0.00391 1.429
@cyb70289
cyb70289 / bw-test.cc
Created January 12, 2024 09:14
bw-test.cc
// tested with g++-10.5; probably okay for other versions as
// the code is quite simple, but check the assembly to make sure
// g++ -std=c++11 -O3 -pthread -static bw-test.cc -o bw-test
// XXX: it takes about half a minute to compile this file
#include <cstdlib>
#include <iostream>
#include <thread>
#include <vector>
[mysqld]
skip_log_bin
transaction_isolation = READ-COMMITTED
# total data size is about 12G
# buffer pool size = 16G
innodb_buffer_pool_instances = 8
innodb_buffer_pool_size = 17179869184