Skip to content

Instantly share code, notes, and snippets.

@lemire
Created April 28, 2024 21:23
Show Gist options
  • Save lemire/46f28d364212ac66d5320929ab3a8758 to your computer and use it in GitHub Desktop.
Save lemire/46f28d364212ac66d5320929ab3a8758 to your computer and use it in GitHub Desktop.
some assembly benchmark
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
// The assembly is potentially unsafe because we read from the stack
// without checking. However, it appears to be good enough for our
// benchmarking purposes under LLVM.
/// Times a loop of 100000 iterations, each issuing four consecutive `ldp`
/// (load pair) instructions that read the 32-bit words at byte offsets 8 and
/// 12 of a local buffer. Prints the elapsed time in nanoseconds followed by
/// the last pair of values loaded (in hexadecimal).
///
/// The inline assembly is AArch64 GNU-style extended asm.
void demo() {
  uint32_t w2, w3;
  uint32_t buffer[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0};
  size_t counter = 100000;
  auto begin = std::chrono::high_resolution_clock::now();
  // Operand notes:
  //  * w2/w3 are early-clobber ("=&r"): they are written while the base
  //    pointer and the counter are still needed, so they must not share a
  //    register with any input.
  //  * counter is read-write ("+r") because `subs` updates it in place.
  //  * "cc": `subs` sets the condition flags.
  //  * "memory": the asm reads `buffer` through a plain register pointer, so
  //    the compiler must have the array contents stored before the asm runs.
  //  * The loop uses a numeric local label so that the asm can be emitted more
  //    than once (e.g. after inlining) without a duplicate-symbol error.
  asm volatile(".align 4\n"
               "1:\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "subs %[counter], %[counter], #1\n"
               "bne 1b\n"
               : [w2] "=&r"(w2), [w3] "=&r"(w3), [counter] "+r"(counter)
               : [x0] "r"(buffer)
               : "cc", "memory");
  auto end = std::chrono::high_resolution_clock::now();
  double nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
  std::cout << "Time taken: " << nanoseconds << " ns" << std::endl;
  // Print the loaded values
  std::cout << std::hex << "w2: " << w2 << ", w3: " << w3 << std::endl;
}
/// Times a loop of 100000 iterations, each performing four groups of
/// "two 32-bit stores to byte offsets 8 and 12 of a local buffer, then one
/// `ldp` reading both words back". Prints the stored value, the elapsed time
/// in nanoseconds, and the last pair of values loaded (in hexadecimal).
///
/// The inline assembly is AArch64 GNU-style extended asm.
void demof() {
  uint32_t w2, w3;
  uint32_t buffer[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0};
  size_t counter = 100000;
  // Value written by every `str` in the loop. It is handed to the asm as an
  // input operand so the stored data is well defined and the register that
  // holds it is chosen (and reserved) by the compiler.
  int result = 1;
  auto begin = std::chrono::high_resolution_clock::now();
  // Operand notes:
  //  * w2/w3 are early-clobber ("=&r"): they are written while the inputs are
  //    still needed, so they must not share a register with any of them.
  //  * counter is read-write ("+r") because `subs` updates it in place.
  //  * "cc": `subs` sets the condition flags.
  //  * "memory": the asm both writes and reads `buffer` through a pointer.
  //  * Numeric local label: safe if the asm is emitted more than once.
  asm volatile(".align 4\n"
               "1:\n"
               "str %w[w8], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "subs %[counter], %[counter], #1\n"
               "bne 1b\n"
               : [w2] "=&r"(w2), [w3] "=&r"(w3), [counter] "+r"(counter)
               : [x0] "r"(buffer), [w8] "r"(result)
               : "cc", "memory");
  // Stop the clock before doing any I/O so printing is not part of the
  // measurement.
  auto end = std::chrono::high_resolution_clock::now();
  std::printf("result: %d\n", result);
  double nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
  std::cout << "Time taken: " << nanoseconds << " ns" << std::endl;
  // Print the loaded values
  std::cout << std::hex << "w2: " << w2 << ", w3: " << w3 << std::endl;
}
/// Times a loop of 100000 iterations, each issuing four pairs of single-word
/// `ldr` loads from byte offsets 8 and 12 of a local buffer. Prints the
/// elapsed time in nanoseconds followed by the last two values loaded (in
/// hexadecimal).
///
/// The inline assembly is AArch64 GNU-style extended asm.
void demo2() {
  uint32_t w2, w3;
  uint32_t buffer[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0};
  size_t counter = 100000;
  auto begin = std::chrono::high_resolution_clock::now();
  // Operand notes:
  //  * w2/w3 are early-clobber ("=&r"): they are written while the base
  //    pointer and the counter are still needed, so they must not share a
  //    register with any input.
  //  * counter is read-write ("+r") because `subs` updates it in place.
  //  * "cc": `subs` sets the condition flags.
  //  * "memory": the asm reads `buffer` through a plain register pointer, so
  //    the compiler must have the array contents stored before the asm runs.
  //  * Numeric local label: safe if the asm is emitted more than once.
  asm volatile(".align 4\n"
               "1:\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "subs %[counter], %[counter], #1\n"
               "bne 1b\n"
               : [w2] "=&r"(w2), [w3] "=&r"(w3), [counter] "+r"(counter)
               : [x0] "r"(buffer)
               : "cc", "memory");
  auto end = std::chrono::high_resolution_clock::now();
  double nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
  std::cout << "Time taken: " << nanoseconds << " ns" << std::endl;
  // Print the loaded values
  std::cout << std::hex << "w2: " << w2 << ", w3: " << w3 << std::endl;
}
/// Times a loop of 100000 iterations, each performing four groups of
/// "two 32-bit stores to byte offsets 8 and 12 of a local buffer, then two
/// single-word `ldr` loads reading those words back". Prints the elapsed time
/// in nanoseconds followed by the last two values loaded (in hexadecimal).
///
/// The inline assembly is AArch64 GNU-style extended asm.
void demo2f() {
  uint32_t w2, w3;
  uint32_t buffer[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0};
  size_t counter = 100000;
  // Value written by every `str` in the loop. Passing it as an input operand
  // gives the stores well-defined data and lets the compiler pick and reserve
  // the register instead of the asm silently using a fixed one.
  const uint32_t stored = 1;
  auto begin = std::chrono::high_resolution_clock::now();
  // Operand notes:
  //  * w2/w3 are early-clobber ("=&r"): they are written while the inputs are
  //    still needed, so they must not share a register with any of them.
  //  * counter is read-write ("+r") because `subs` updates it in place.
  //  * "cc": `subs` sets the condition flags.
  //  * "memory": the asm both writes and reads `buffer` through a pointer.
  //  * Numeric local label: safe if the asm is emitted more than once.
  asm volatile(".align 4\n"
               "1:\n"
               "str %w[w8], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[w8], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[w8], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[w8], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "subs %[counter], %[counter], #1\n"
               "bne 1b\n"
               : [w2] "=&r"(w2), [w3] "=&r"(w3), [counter] "+r"(counter)
               : [x0] "r"(buffer), [w8] "r"(stored)
               : "cc", "memory");
  auto end = std::chrono::high_resolution_clock::now();
  double nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
  std::cout << "Time taken: " << nanoseconds << " ns" << std::endl;
  // Print the loaded values
  std::cout << std::hex << "w2: " << w2 << ", w3: " << w3 << std::endl;
}
/// Times a loop of 100000 iterations in which every 32-bit store to a local
/// buffer is immediately followed by an `ldr` of the same word: store/load at
/// byte offset 8, then store/load at byte offset 12, repeated four times per
/// iteration. Prints the elapsed time in nanoseconds followed by the last two
/// values loaded (in hexadecimal).
///
/// The inline assembly is AArch64 GNU-style extended asm.
void demo2ff() {
  uint32_t w2, w3;
  uint32_t buffer[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0};
  size_t counter = 100000;
  // Value written by every `str` in the loop. Passing it as an input operand
  // gives the stores well-defined data and lets the compiler pick and reserve
  // the register instead of the asm silently using a fixed one.
  const uint32_t stored = 1;
  auto begin = std::chrono::high_resolution_clock::now();
  // Operand notes:
  //  * w2/w3 are early-clobber ("=&r"): they are written while the inputs are
  //    still needed, so they must not share a register with any of them.
  //  * counter is read-write ("+r") because `subs` updates it in place.
  //  * "cc": `subs` sets the condition flags.
  //  * "memory": the asm both writes and reads `buffer` through a pointer.
  //  * Numeric local label: safe if the asm is emitted more than once.
  asm volatile(".align 4\n"
               "1:\n"
               "str %w[w8], [%[x0], #8]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[w8], [%[x0], #8]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[w8], [%[x0], #8]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[w8], [%[x0], #8]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "str %w[w8], [%[x0], #0x0c]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "subs %[counter], %[counter], #1\n"
               "bne 1b\n"
               : [w2] "=&r"(w2), [w3] "=&r"(w3), [counter] "+r"(counter)
               : [x0] "r"(buffer), [w8] "r"(stored)
               : "cc", "memory");
  auto end = std::chrono::high_resolution_clock::now();
  double nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
  std::cout << "Time taken: " << nanoseconds << " ns" << std::endl;
  // Print the loaded values
  std::cout << std::hex << "w2: " << w2 << ", w3: " << w3 << std::endl;
}
/// Runs each benchmark variant once, in a fixed order. Every variant prints
/// its own timing and loaded values.
///
/// Command-line arguments are accepted but ignored: the previous code parsed
/// an "attempt" count from argv[1] and clamped it, yet never used the value,
/// so that dead parsing has been dropped without changing behaviour.
int main(int /*argc*/, char ** /*argv*/) {
  demo();
  demof();
  demo2();
  demo2f();
  demo2ff();
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment