Skip to content

Instantly share code, notes, and snippets.

@LiosK
Last active August 20, 2023 23:53
Show Gist options
  • Save LiosK/6abcd792a159e3a796ca3a52054444e2 to your computer and use it in GitHub Desktop.
Save LiosK/6abcd792a159e3a796ca3a52054444e2 to your computer and use it in GitHub Desktop.
A quick benchmark of / and >> in computing millisecond timestamp
#include <stdint.h>
#include <stdio.h>
#include <time.h>
static const long REPEAT = 4;
static const long NUMBER = 400000000;
/// Computes a millisecond timestamp from `struct timespect` components.
#define div(tv_sec, tv_nsec) \
((uint64_t)(tv_sec)*1000 + (uint64_t)(tv_nsec) / 1000000)
/// Same as `div()`, but uses multiplication and shift operations.
///
/// @see https://dl.acm.org/doi/10.1145/773473.178249
#define shr(tv_sec, tv_nsec) \
((uint64_t)(tv_sec)*1000 + ((uint64_t)(tv_nsec)*UINT64_C(2251799814) >> 51))
volatile struct timespec tp;
volatile uint64_t discarded;
static inline double timespec_diff(const struct timespec *end,
const struct timespec *begin) {
return (double)(end->tv_sec - begin->tv_sec) +
(double)(end->tv_nsec - begin->tv_nsec) / 1000000000.0;
}
int main(void) {
clock_gettime(CLOCK_REALTIME, (struct timespec *)&tp);
printf("tv_sec: %ld, tv_nsec: %ld\n", tp.tv_sec, tp.tv_nsec);
printf("=> div: %llu msec\n", div(tp.tv_sec, tp.tv_nsec));
printf("=> shr: %llu msec\n", shr(tp.tv_sec, tp.tv_nsec));
struct {
double div;
double shr;
} results = {0.0, 0.0};
printf("\nExecuting %ld ops x %ld...\n", NUMBER, REPEAT);
for (long i = 0; i < REPEAT; i++) {
struct timespec begin, end;
clock_gettime(CLOCK_MONOTONIC, &begin);
for (long j = 0; j < NUMBER; j++) {
discarded = div(tp.tv_sec, tp.tv_nsec);
}
clock_gettime(CLOCK_MONOTONIC, &end);
results.div += timespec_diff(&end, &begin);
clock_gettime(CLOCK_MONOTONIC, &begin);
for (long j = 0; j < NUMBER; j++) {
discarded = shr(tp.tv_sec, tp.tv_nsec);
}
clock_gettime(CLOCK_MONOTONIC, &end);
results.shr += timespec_diff(&end, &begin);
}
printf("div: %.3f sec (%.3f nsec/op)\n", results.div / REPEAT,
results.div / REPEAT / NUMBER * 1000000000);
printf("shr: %.3f sec (%.3f nsec/op)\n", results.shr / REPEAT,
results.shr / REPEAT / NUMBER * 1000000000);
return 0;
}
.PHONY: run clean
run: divshr.O0 divshr.O2
./divshr.O0
./divshr.O2
clean:
$(RM) divshr.O*
divshr.O0: divshr.c
$(CC) -Wall -Wextra -O0 -o$@ $<
divshr.O2: divshr.c
$(CC) -Wall -Wextra -O2 -o$@ $<
cc -Wall -Wextra -O0 -odivshr.O0 divshr.c
cc -Wall -Wextra -O2 -odivshr.O2 divshr.c
./divshr.O0
tv_sec: 1692460446, tv_nsec: 656331000
=> div: 1692460446656 msec
=> shr: 1692460446656 msec
Executing 400000000 ops x 4...
div: 3.181 sec (7.952 nsec/op)
shr: 0.693 sec (1.734 nsec/op)
./divshr.O2
tv_sec: 1692460462, tv_nsec: 205994000
=> div: 1692460462205 msec
=> shr: 1692460462205 msec
Executing 400000000 ops x 4...
div: 0.230 sec (0.575 nsec/op)
shr: 0.230 sec (0.574 nsec/op)
@LiosK
Copy link
Author

LiosK commented Aug 20, 2023

That result is to some extent consistent with what I get from assemblies. O2 shr code emits fewer instructions than does div, which might result in the throughput difference. As for O0, perhaps division is much faster on ARM than on x86.

Anyway, I think those results support my opinion that we don't really need to be worried about divisions.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment