Skip to content

Instantly share code, notes, and snippets.

@cipri-tom
Forked from mkfmnn/callback-bench.lua
Last active August 20, 2021 08:23
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cipri-tom/e4f28c2785ff0de30b71 to your computer and use it in GitHub Desktop.
Save cipri-tom/e4f28c2785ff0de30b71 to your computer and use it in GitHub Desktop.
--- Write a formatted string to standard output (no newline is appended).
-- @param s   format string, used as the receiver of `:format`
-- @param ... values substituted into the format string
local function printf(s, ...)
  local text = s:format(...)
  io.write(text)
end
local ffi = require("ffi")
ffi.cdef[[
typedef void (*cb)(void);
void set_v(int n, void (*)(void ));
void set_i(int n, void (*)(int ));
void set_d(int n, void (*)(double));
int get_i(int n, int (*)(void));
double get_d(int n, double (*)(void));
void call_v(void );
void call_i(int );
void call_d(double);
void loop (int n);
]]
local callback = ffi.load("./callback.so")
local timeit = require("timeit")
-- Shared counter mutated/read by the callbacks below.
local v = 0

--- Callback taking no argument: bumps the shared counter.
local function lset_v()
  v = v + 1
end

--- Callback taking one numeric argument: does arithmetic on it and
-- discards the result (the shared counter is untouched).
local function lset_a(a)
  a = a + 1
end

--- Callback returning a value: twice the shared counter.
local function lget()
  return v * 2
end
print("operation ", "reps ", "time(s)", "nsec/call")

-- C -> Lua benchmarks: each entry names a C driver from callback.so, the Lua
-- function it should call back into, and the C function-pointer type that
-- the driver expects for that callback.
local c2l = {
  {name='set_v', func=lset_v, ctype='void (*)(void)'},
  {name='set_i', func=lset_a, ctype='void (*)(int)'},
  {name='set_d', func=lset_a, ctype='void (*)(double)'},
  {name='get_i', func=lget,   ctype='int (*)(void)'},
  {name='get_d', func=lget,   ctype='double (*)(void)'},
}

for _, test in ipairs(c2l) do
  -- Passing a Lua function directly to a C function makes LuaJIT create a new
  -- callback on every call, and callbacks created by implicit conversion are
  -- never released. timeit() invokes the closure many times, so build the
  -- callback once explicitly and free it when this test is done.
  local cb = ffi.cast(test.ctype, test.func)
  local r = timeit(function(n)
    callback[test.name](n, cb)
  end)
  cb:free()
  printf("C into Lua %-12s %s\n", test.name, r)
end
-- Remaining benchmarks, run in order. Each entry is {label, body}, where the
-- body receives the repeat count chosen by timeit() and does the repetition.
local direct_benchmarks = {
  { "Lua into C call(void) ", function(n)
      for i = 1, n do callback.call_v() end
    end },
  { "Lua into C call(int) ", function(n)
      for i = 1, n do callback.call_i(3) end
    end },
  { "Lua into C call(double)", function(n)
      for i = 1, n do callback.call_d(3.5) end
    end },
  { "Lua into Lua ", function(n)
      for i = 1, n do lset_v() end
    end },
  -- Baseline: the loop itself runs inside the C library.
  { "C empty loop ", function(n)
      callback.loop(n)
    end },
  -- Baseline: an empty Lua loop.
  { "Lua empty loop ", function(n)
      for i = 1, n do end
    end },
}

for _, bench in ipairs(direct_benchmarks) do
  print(bench[1], timeit(bench[2]))
end
// gcc -std=c99 -Wall -pedantic -O3 -shared -static-libgcc -fPIC callback.c -o callback.so
// --- CALLS ------------------------------------------------------------------
/* Empty target for the "Lua into C call(void)" measurement. */
void call_v(void)
{
    return;
}

/* Target taking an int; the arithmetic only touches the local copy. */
void call_i(int v)
{
    v = v + 5;
}

/* Target taking a double; the arithmetic only touches the local copy. */
void call_d(double v)
{
    v = v + 5.0;
}
/* Spin through n iterations doing nothing (baseline for loop overhead). */
void loop(int n)
{
    int i = 0;
    while (i < n) {
        /* empty asm statement keeps the compiler from deleting the loop */
        __asm__("");
        ++i;
    }
}
// --- SETTERS ----------------------------------------------------------------
/* Invoke the argument-less callback f exactly n times (none if n <= 0). */
void set_v(int n, void (*f)(void))
{
    int calls_made = 0;
    while (calls_made < n) {
        f();
        ++calls_made;
    }
}
/* Invoke f n times, passing the iteration index 0, 1, ..., n-1 in order. */
void set_i(int n, void (*f)(int))
{
    int index = 0;
    while (index < n) {
        f(index);
        index += 1;
    }
}
/* Invoke f n times, always passing the same double value (3.0). */
void set_d(int n, void (*f)(double))
{
    const double arg = 3.0;
    int done = 0;
    while (done < n) {
        f(arg);
        ++done;
    }
}
// --- GETTERS ----------------------------------------------------------------
/*
 * Call the int-returning callback f n times and return the value produced by
 * the last call (0 when n <= 0, i.e. when f is never called).
 *
 * The Lua side declares this function as `int get_i(int, int (*)(void))`, so
 * the definition returns int to match that prototype. Returning the value
 * also removes the read of an uninitialised variable that the old
 * `(void)v;` performed when n <= 0.
 */
int get_i(int n, int (*f)(void)) {
    int v = 0;
    for (int i = 0; i < n; i++) {
        v = f();
    }
    return v;
}
/*
 * Call the double-returning callback f n times and return the value produced
 * by the last call (0.0 when n <= 0, i.e. when f is never called).
 *
 * The Lua side declares this function as `double get_d(int, double (*)(void))`,
 * so the definition returns double to match that prototype. Returning the
 * value also removes the read of an uninitialised variable that the old
 * `(void)v;` performed when n <= 0.
 */
double get_d(int n, double (*f)(void)) {
    double v = 0.0;
    for (int i = 0; i < n; i++) {
        v = f();
    }
    return v;
}
// --- PUSH vs PULL -----------------------------------------------------------
// Callback type: receives a length and a byte array declared with that
// length, and returns a double.
typedef double (*getter_fp)(int len, unsigned char mono[len]);
// Array of doubles preceded by an int `size` field; `data` is a flexible
// array member, so its storage is supplied by whoever allocates the struct.
struct Arr {
int size;
double data[];
};
// MONO_LEN is the number of elements in `mono`.
enum constants {MONO_LEN = 5};
// Fixed byte array that push_style passes to the callback and that
// get_mono returns.
unsigned char mono[MONO_LEN] = {1, 2, 3, 4, 5};
// --- --- PUSH style ---------------------------------------------------------
/*
 * Multiply every element of a->data (indices 0 .. a->size-1) by the value the
 * callback returns; the callback is invoked once per element with MONO_LEN
 * and the global `mono` array.
 */
void push_style(struct Arr *a, getter_fp get_multiplier)
{
    int i = 0;
    while (i < a->size) {
        const double factor = get_multiplier(MONO_LEN, mono);
        a->data[i] *= factor;
        ++i;
    }
}
// --- --- PULL style ---------------------------------------------------------
/*
 * Return the number of elements in the global `mono` array.
 *
 * Declared with an explicit (void) parameter list: empty parentheses in C
 * give a declaration without a prototype, which disables argument checking
 * at call sites.
 */
int get_mono_len(void)
{
    return MONO_LEN;
}
/*
 * Return a pointer to the first element of the global `mono` array.
 * The idx argument is accepted but not used.
 */
unsigned char *
get_mono(int idx)
{
    (void)idx;
    return &mono[0];
}
-- OBJECTIVE : apply a Lua function to all members of an array
-- -- PUSH style: do it on the C side, with a callback to the Lua function
-- -- PULL style: do it on the Lua side, with calls to C to get the necessary info
local ffi = require("ffi")
ffi.cdef[[
typedef double (*getter_fp)(int len, unsigned char mono[]);
struct Arr {
int size;
double data[?];
};
void push_style(struct Arr *a, getter_fp get_multiplier);
int get_mono_len();
unsigned char* get_mono(int idx);
]]
local callback = ffi.load("./callback.so")
local timeit = require("timeit")
local arr_t = ffi.typeof("struct Arr");
--- The callback: half the sum of the first `len` entries of `mono`.
-- @param len  number of entries to read
-- @param mono zero-indexed sequence of numbers (indices 0 .. len-1 are read)
-- @return sum of those entries multiplied by 0.5
local function lget_multiplier(len, mono)
  local total = 0
  for offset = 1, len do
    total = total + mono[offset - 1]
  end
  return total * 0.5
end
-- PUSH style -----------------------------------------------------------------
--- PUSH style: fill an array with 0 .. n-1, then let the C side walk it and
-- call back into Lua for the multiplier of each element.
-- @param n number of elements
-- @return the `struct Arr` cdata after C has scaled it
local function push_style(n)
  local a = arr_t(n, {n})
  for i = 0, n - 1 do a.data[i] = i end
  local cb = ffi.cast("getter_fp", lget_multiplier)
  callback.push_style(a, cb)
  -- FFI callbacks are a limited resource and are not garbage collected;
  -- release this one now that the C side is done with it, otherwise every
  -- call to push_style leaks a callback slot.
  cb:free()
  return a
end
-- PULL style -----------------------------------------------------------------
--- PULL style: fill an array with 0 .. n-1, then scale it on the Lua side,
-- asking C for the byte array (and, once, its length) needed by the multiplier.
-- @param n number of elements
-- @return the `struct Arr` cdata after scaling
local function pull_style(n)
  local a = arr_t(n, {n})
  for k = 0, n - 1 do
    a.data[k] = k
  end

  local mono_len = callback.get_mono_len()
  local last = n - 1
  for k = 0, last do
    local bytes = callback.get_mono(k)
    local factor = lget_multiplier(mono_len, bytes)
    a.data[k] = a.data[k] * factor
  end
  return a
end
-- CHECK ----------------------------------------------------------------------
-- Both styles must produce the same array before their timings are compared.
local push_v = push_style(100)
local pull_v = pull_style(100)
assert(push_v.size == pull_v.size)
do
  local idx = 0
  while idx <= push_v.size - 1 do
    assert(push_v.data[idx] == pull_v.data[idx])
    idx = idx + 1
  end
end
-- BENCH ----------------------------------------------------------------------
local push_report = timeit(push_style)
print("PUSH style", push_report)
local pull_report = timeit(pull_style)
print("PULL style", pull_report)
operation reps time(s) nsec/call
C into Lua set_v 10000000 0.498 49.817
C into Lua set_i 10000000 0.662 66.249
C into Lua set_d 10000000 0.681 68.143
C into Lua get_i 10000000 0.633 63.272
C into Lua get_d 10000000 0.650 64.990
Lua into C call(void) 100000000 0.381 3.807
Lua into C call(int) 100000000 0.381 3.815
Lua into C call(double) 100000000 0.415 4.154
Lua into Lua 100000000 0.104 1.039
C empty loop 1000000000 0.695 0.695
Lua empty loop 1000000000 0.693 0.693
PUSH style 1000000 0.158 158.256
PULL style 1000000 0.207 207.297
--- Time a function that performs its own repetition.
-- The module value is a function taking one argument, `func`, which must
-- itself accept a repeat count. `func` is run with a repeat count that grows
-- tenfold each round (first round: 100000) until a single run takes more than
-- 0.1 s of `os.clock()` time or the count reaches 1e9. It is then run four
-- more times at that count and the smallest elapsed time is kept.
-- Loosely modelled on Python's timeit, except that `func` does the looping.
-- @param func function(reps) performing `reps` repetitions of the work
-- @return string "<reps>\t<seconds>\t<nanoseconds per repetition>"
return function(func)
  -- One timed invocation of `func` at the given repeat count.
  local function timed_run(count)
    local t0 = os.clock()
    func(count)
    return os.clock() - t0
  end

  local reps = 10000
  local best
  repeat
    reps = reps * 10
    best = timed_run(reps)
  until best > 0.1 or reps >= 1e9

  -- Keep the minimum over four further runs at the chosen count.
  for _ = 1, 4 do
    best = math.min(best, timed_run(reps))
  end

  local nsec_per_call = best / reps * 1e9
  return ("%10d\t%.3f\t%7.3f"):format(reps, best, nsec_per_call)
end
@cipri-tom
Copy link
Author

Thanks for the detailed analysis and for running it on such different architectures!
It's really nice to see ARM in there too; we can only expect it to get better, indeed.

I had no idea people were still interested in this benchmark. I revisited the answers to the question and I find that, indeed, this is measuring the CPU more than anything else. I really recommend people take Josh's suggestion and benchmark everything in context, since this empty benchmark may not carry over to one's real scenario.

But other than that, we can see that the performance is about the same regardless of the direction (Lua -> C, C->Lua) except on ARM, so I'd first write for readability and ease of use, and then only move code to the other side if it turns out to be a bottleneck.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment