@cipri-tom
Forked from mkfmnn/callback-bench.lua
Last active August 20, 2021 08:23
local function printf(s, ...)
    io.write(s:format(...))
end
local ffi = require("ffi")
ffi.cdef[[
typedef void (*cb)(void);
void set_v(int n, void (*)(void ));
void set_i(int n, void (*)(int ));
void set_d(int n, void (*)(double));
int get_i(int n, int (*)(void));
double get_d(int n, double (*)(void));
void call_v(void );
void call_i(int );
void call_d(double);
void loop (int n);
]]
local callback = ffi.load("./callback.so")
local timeit = require("timeit")
local v = 0
local function lset_v( ) v = v + 1 end
local function lset_a(a) a = a + 1 end
local function lget ( ) return v*2 end
print("operation ", "reps ", "time(s)", "nsec/call")
local c2l = {
    {name='set_v', func=lset_v},
    {name='set_i', func=lset_a},
    {name='set_d', func=lset_a},
    {name='get_i', func=lget  },
    {name='get_d', func=lget  }
}
for _,test in ipairs(c2l) do
    local r = timeit(function(n)
        callback[test.name](n, test.func)
    end)
    printf("C into Lua %-12s %s\n", test.name, r)
end
print("Lua into C call(void) ", timeit(function(n)
for i = 1, n do callback.call_v() end
end))
print("Lua into C call(int) ", timeit(function(n)
for i = 1, n do callback.call_i(3) end
end))
print("Lua into C call(double)", timeit(function(n)
for i = 1, n do callback.call_d(3.5) end
end))
print("Lua into Lua ", timeit(function(n)
for i = 1, n do lset_v() end
end))
print("C empty loop ", timeit(function(n)
callback.loop(n)
end))
print("Lua empty loop ", timeit(function(n)
for i = 1, n do end
end))
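-- How to run (a sketch; the file names are assumptions: this script saved as
-- callback-bench.lua, the C source below as callback.c, the timing module as
-- timeit.lua, all in one directory):
--   1. build the shared library with the gcc invocation at the top of callback.c
--   2. run this script with LuaJIT:  luajit callback-bench.lua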
// gcc -std=c99 -Wall -pedantic -O3 -shared -static-libgcc -fPIC callback.c -o callback.so
// --- CALLS ------------------------------------------------------------------
void call_v(void) {
}
void call_i(int v) {
    v += 5;
}
void call_d(double v) {
    v += 5.0;
}
void loop(int n) {
    for (int i = 0; i < n; i++) {
        /* prevent compiler optimizations from skipping loop entirely */
        __asm__("");
    }
}
// --- SETTERS ----------------------------------------------------------------
void set_v(int n, void (*f)(void)) {
    for (int i = 0; i < n; i++) {
        f();
    }
}
void set_i(int n, void (*f)(int)) {
    for (int i = 0; i < n; i++) {
        f(i);
    }
}
void set_d(int n, void (*f)(double)) {
    double a = 3.0;
    for (int i = 0; i < n; i++) {
        f(a);
    }
}
// --- GETTERS ----------------------------------------------------------------
int get_i(int n, int (*f)(void)) {
    int v = 0;
    for (int i = 0; i < n; i++) {
        v = f();
    }
    return v; // return the last value so the signature matches the ffi.cdef declaration
}
double get_d(int n, double (*f)(void)) {
    double v = 0;
    for (int i = 0; i < n; i++) {
        v = f();
    }
    return v; // return the last value so the signature matches the ffi.cdef declaration
}
// --- PUSH vs PULL -----------------------------------------------------------
typedef double (*getter_fp)(int len, unsigned char mono[len]);
struct Arr {
    int size;
    double data[];
};
enum constants {MONO_LEN = 5};
unsigned char mono[MONO_LEN] = {1, 2, 3, 4, 5};
// --- --- PUSH style ---------------------------------------------------------
void push_style(struct Arr *a, getter_fp get_multiplier)
{
    for (int i = 0; i < a->size; ++i)
        a->data[i] *= get_multiplier(MONO_LEN, mono);
}
// --- --- PULL style ---------------------------------------------------------
int get_mono_len()
{
    return MONO_LEN;
}
unsigned char*
get_mono(int idx)
{
    return mono;
}
-- OBJECTIVE: apply a Lua function to all members of an array
-- -- PUSH style: do it on the C side, with a callback to the Lua function
-- -- PULL style: do it on the Lua side, with calls into C to get the necessary info
local ffi = require("ffi")
ffi.cdef[[
typedef double (*getter_fp)(int len, unsigned char mono[]);
struct Arr {
    int size;
    double data[?];
};
void push_style(struct Arr *a, getter_fp get_multiplier);
int get_mono_len();
unsigned char* get_mono(int idx);
]]
local callback = ffi.load("./callback.so")
local timeit = require("timeit")
local arr_t = ffi.typeof("struct Arr");
-- the callback
local function lget_multiplier(len, mono)
    local s = 0
    for i=0,len-1 do s = s + mono[i] end
    return s * 0.5
end
-- PUSH style -----------------------------------------------------------------
local function push_style(n)
    local a = arr_t(n, {n})
    for i=0,n-1 do a.data[i] = i end
    local cb = ffi.cast("getter_fp", lget_multiplier)
    callback.push_style(a, cb)
    cb:free()   -- release the callback slot (LuaJIT has a limited number of them)
    return a
end
-- PULL style -----------------------------------------------------------------
local function pull_style(n)
    local a = arr_t(n, {n})
    for i=0,n-1 do a.data[i] = i end
    local mono_len, mono = callback.get_mono_len()
    for i=0,n-1 do
        mono = callback.get_mono(i)
        a.data[i] = a.data[i] * lget_multiplier(mono_len, mono)
    end
    return a
end
-- CHECK ----------------------------------------------------------------------
local push_v, pull_v = push_style(100), pull_style(100)
assert(push_v.size == pull_v.size)
for i=0,push_v.size-1 do
    assert(push_v.data[i] == pull_v.data[i])
end
-- BENCH ----------------------------------------------------------------------
print("PUSH style", timeit(push_style))
print("PULL style", timeit(pull_style))
--- Call a function with a repeat-count argument.
-- Takes a single argument: a function which in turn takes one argument, a repeat
-- count. That function is called with increasingly large repeat counts until it
-- takes at least a certain amount of time to run; it is then called four more
-- times with the same repeat count, and the minimum elapsed time is recorded.
-- Modeled loosely on Python's timeit, except that the function passed in is
-- responsible for doing the actual repetition.
return function(func)
    local reps = 10000
    local elapsed
    repeat
        reps = reps * 10
        local start = os.clock()
        func(reps)
        elapsed = os.clock() - start
    until elapsed > 0.1 or reps >= 1e9
    for i = 1, 4 do
        local start = os.clock()
        func(reps)
        elapsed = math.min(elapsed, os.clock() - start)
    end
    return ("%10d\t%.3f\t%7.3f"):format(reps, elapsed, elapsed / reps * 1e9)
end
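-- Example usage (a sketch; assumes this module is saved as timeit.lua on package.path):
--   local timeit = require("timeit")
--   print("empty loop", timeit(function(n) for i = 1, n do end end))
-- The function passed in receives the repeat count and must do the looping itself.
-- The returned string is "<reps> <best elapsed seconds> <nanoseconds per rep>";
-- e.g. 1e9 reps in 0.303 s works out to 0.303 / 1e9 * 1e9 = 0.303 ns per rep, which
-- is how the nsec/call column in the results below should be read.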
@exikyut commented Jul 24, 2021

I happened to be curious about the real-world difference between push vs pull approaches - how much of a performance impact there is, but also what sort of timescale the slowdown is occurring at (which is arguably more important for orientation).

A quick google found https://stackoverflow.com/questions/12329128/luajit-ffi-callback-performance/12435278#12435278, an answer that referenced a source Gist with benchmark code. Unsure how to run it (-.-), I noticed it had a few forks - then stumbled on this interesting-looking, slightly extended version. (Edit: Yup, I didn't scroll slightly further down to find the additional answer directly underneath the one I found, pointing straight here...)

After figuring out that, y'know, you just run the two scripts ;) (but thanks very much for adding the gcc invocation, I might not have figured that out myself before giving up), I got sliiiightly sidetracked wondering what sort of performance differences might exist in various compute environments.

My own hardware is getting pretty old; it would be very interesting to see how much faster the latest chipsets perform, and whether they're significantly faster (eg, by 1-10%) or only slightly faster.

Without reference points I'm not sure if the free-tier public cloud resources I've tested against just happen to closely correlate with my own systems' performance, or whether things are broadly neck-and-neck.

It was particularly interesting to discover what hardware I was running on in some environments! EPYC's getting around... (duh)

All results below are from LuaJIT 2.1.0-beta3. I wondered about also comparing with Git HEAD, and while that does sound interesting, I didn't want to make this twice as long...


i3-3220 (3.3GHz) "server" w/ 1600MHz DDR3:

operation               reps            time(s) nsec/call
C into Lua set_v          10000000      0.379    37.903
C into Lua set_i          10000000      0.490    48.960
C into Lua set_d          10000000      0.488    48.804
C into Lua get_i          10000000      0.489    48.913
C into Lua get_d          10000000      0.475    47.489
Lua into C call(void)    100000000      0.182     1.819
Lua into C call(int)     100000000      0.182     1.819
Lua into C call(double)  100000000      0.182     1.819
Lua into Lua            1000000000      0.909     0.909
C empty loop            1000000000      0.303     0.303
Lua empty loop          1000000000      0.303     0.303

PUSH style         1000000      0.112   112.246
PULL style         1000000      0.149   148.537

EliteBook 8470p (i5-3360M (2.8GHz), 1600MHz DDR3) + Chrome chewing ~12% CPU 🔥:

operation               reps            time(s) nsec/call
C into Lua set_v          10000000      0.370    37.001
C into Lua set_i          10000000      0.477    47.652
C into Lua set_d          10000000      0.474    47.395
C into Lua get_i          10000000      0.475    47.507
C into Lua get_d          10000000      0.463    46.325
Lua into C call(void)    100000000      0.174     1.745
Lua into C call(int)     100000000      0.174     1.744
Lua into C call(double)  100000000      0.175     1.745
Lua into Lua            1000000000      0.872     0.872
C empty loop            1000000000      0.293     0.293
Lua empty loop          1000000000      0.293     0.293

PUSH style         1000000      0.112   112.162
PULL style         1000000      0.156   156.177

Google Cloud shell session (4 cores, EPYC 7B12):

operation               reps            time(s) nsec/call
C into Lua set_v          10000000      0.383    38.350
C into Lua set_i          10000000      0.509    50.927
C into Lua set_d          10000000      0.503    50.275
C into Lua get_i          10000000      0.524    52.366
C into Lua get_d          10000000      0.522    52.226
Lua into C call(void)    100000000      0.189     1.892
Lua into C call(int)     100000000      0.189     1.888
Lua into C call(double)  100000000      0.189     1.890
Lua into Lua             100000000      0.151     1.508
C empty loop            1000000000      0.381     0.381
Lua empty loop          1000000000      0.377     0.377

PUSH style         1000000      0.131   131.022
PULL style         1000000      0.180   179.628

Google Cloud free tier f1-micro (1 core, "Xeon(R) CPU @ 2.20GHz"):

operation               reps            time(s) nsec/call
C into Lua set_v          10000000      0.377    37.709
C into Lua set_i          10000000      0.522    52.171
C into Lua set_d          10000000      0.529    52.860
C into Lua get_i          10000000      0.510    50.979
C into Lua get_d          10000000      0.505    50.461
Lua into C call(void)    100000000      0.186     1.858
Lua into C call(int)     100000000      0.186     1.864
Lua into C call(double)  100000000      0.186     1.865
Lua into Lua             100000000      0.149     1.493
C empty loop            1000000000      0.373     0.373
Lua empty loop          1000000000      0.373     0.373

PUSH style         1000000      0.148   147.673
PULL style         1000000      0.197   197.097

AWS free tier t2-micro (1 core, E5-2676 v3 @ 2.4GHz):

operation               reps            time(s) nsec/call
C into Lua set_v          10000000      0.439    43.869
C into Lua set_i          10000000      0.575    57.519
C into Lua set_d          10000000      0.572    57.180
C into Lua get_i          10000000      0.566    56.595
C into Lua get_d          10000000      0.552    55.163
Lua into C call(void)    100000000      0.187     1.870
Lua into C call(int)     100000000      0.187     1.869
Lua into C call(double)  100000000      0.224     2.238
Lua into Lua             100000000      0.112     1.118
C empty loop            1000000000      0.381     0.381
Lua empty loop          1000000000      0.377     0.377

PUSH style         1000000      0.134   134.483
PULL style         1000000      0.187   187.290

Contabo VPS (2 cores, E5-2620 v3 @ 2.4GHz) + some (I/O-throttled) background processing 💽:

operation               reps            time(s) nsec/call
C into Lua set_v          10000000      0.478    47.840
C into Lua set_i          10000000      0.647    64.731
C into Lua set_d          10000000      0.603    60.268
C into Lua get_i          10000000      0.628    62.763
C into Lua get_d          10000000      0.622    62.220
Lua into C call(void)    100000000      0.205     2.045
Lua into C call(int)     100000000      0.204     2.036
Lua into C call(double)  100000000      0.200     1.998
Lua into Lua             100000000      0.119     1.190
C empty loop            1000000000      0.414     0.414
Lua empty loop          1000000000      0.393     0.393

PUSH style         1000000      0.174   173.705
PULL style         1000000      0.236   235.650

Oracle Cloud free tier, VM.Standard.E2.1.Micro (2 cores, EPYC 7551):

operation               reps            time(s) nsec/call
C into Lua set_v          10000000      0.418    41.831
C into Lua set_i          10000000      0.602    60.198
C into Lua set_d          10000000      0.594    59.361
C into Lua get_i          10000000      0.518    51.774
C into Lua get_d          10000000      0.527    52.681
Lua into C call(void)    100000000      0.200     2.002
Lua into C call(int)     100000000      0.190     1.904
Lua into C call(double)  100000000      0.200     2.001
Lua into Lua             100000000      0.127     1.268
C empty loop            1000000000      0.400     0.400
Lua empty loop          1000000000      0.401     0.401

PUSH style         1000000      0.141   140.702
PULL style         1000000      0.192   191.596

Oracle Cloud free tier, VM.Standard.A1.Flex (4/4 cores enabled, Ampere Neoverse N1):

operation               reps            time(s) nsec/call
C into Lua set_v          10000000      0.390    39.000
C into Lua set_i          10000000      0.510    51.000
C into Lua set_d          10000000      0.500    50.000
C into Lua get_i          10000000      0.510    51.000
C into Lua get_d          10000000      0.490    49.000
Lua into C call(void)     10000000      0.730    73.000
Lua into C call(int)      10000000      0.940    94.000
Lua into C call(double)   10000000      0.930    93.000
Lua into Lua            1000000000      0.660     0.660
C empty loop            1000000000      0.370     0.370
Lua empty loop          1000000000      0.370     0.370

PUSH style        10000000      1.030   103.000
PULL style         1000000      0.420   420.000

Of note is that LuaJIT's AArch64 JIT (as of 2.1.0) doesn't optimize Lua-into-C yet. (Maybe Git HEAD does?)
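(A quick way to check on a newer build would presumably be to run the first script under the JIT's verbose mode, e.g. luajit -jv callback-bench.lua, and look for NYI / trace-abort messages around the FFI calls.)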


An extra tidbit: I had to download and build LuaJIT from source on a couple of environments to get the same version everywhere. It was a bit of a small-pause moment to realize how fast the ARM system was (you did read the last entry above, right? 😺) in comparison to the rest of the systems.

Using make -j32, I saw:

Contabo (2 cores):

real    0m55.110s
user    1m30.614s
sys     0m9.229s

AWS t2-micro (1 core):

real    0m32.216s
user    0m30.347s
sys     0m1.758s

Google Cloud shell session (4 cores):

real    0m14.639s
user    0m45.821s
sys     0m4.245s

Oracle Cloud VM.Standard.A1.Flex (4 cores):

real    0m5.815s
user    0m20.859s
sys     0m0.605s

It's been very interesting to realize what sorts of resources are being made available for free (with the Ampere offering).

It's very reasonable to posit that the current situation only exists to bootstrap developers' interest, and may change once there is sufficient mindshare (and tenancy!) saturation - but if this is the sort of performance on offer, once that mindshare has been established, it'll rapidly become entrenched and demand will only increase.

Scaling 4 cores and 24GB RAM out for free is actually kind of interesting - that's significant enough that, if the pricing structure *were* to change (c'est la vie Amazon Cloud Drive), other provider(s) would likely be able to step up and maintain this new status quo, because of the favorable economies of scale (the Ampere compute shape scales out to 80 cores and 512GB RAM).

This has been unexpectedly educational.

@cipri-tom (Author)

Thanks for the detailed analysis and for running it on very different architectures!
Really nice to see ARM in there too; we can only expect it to get better, indeed.

I had no idea people were still interested in this benchmark. I revisited the answers to the question and find that, indeed, this is measuring the CPU more than anything else. I really recommend people take Josh's suggestion and benchmark everything in context, since this empty benchmark may not carry over to one's real scenario.

But other than that, we can see that the performance is about the same regardless of the direction (Lua -> C, C -> Lua) except on ARM, so I'd write for readability and ease of use first, and only move code to the other side if it turns out to be a bottleneck.
