Created
April 28, 2024 21:23
-
-
Save lemire/46f28d364212ac66d5320929ab3a8758 to your computer and use it in GitHub Desktop.
some assembly benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <iostream>
// The assembly is potentially unsafe because we read from the stack | |
// without checking. However, it appears to be good enough for our | |
// benchmarking purposes under LLVM. | |
// Times a loop whose body is four paired 32-bit loads (ldp) from buffer[2]
// and buffer[3] (byte offsets 8 and 12), then prints the elapsed time in
// nanoseconds and the two values left in the destination registers.
// AArch64 only (GNU-style inline assembly).
void demo() {
  uint32_t w2, w3;
  uint32_t buffer[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0};
  size_t counter = 100000;
  auto begin = std::chrono::high_resolution_clock::now();
  // Operand contract:
  //  - w2/w3 are early-clobber ("=&r"): they are written on every iteration
  //    while the base pointer is still needed, so they must not share a
  //    register with it.
  //  - counter is read-write ("+r") because subs decrements it in place.
  //  - "cc" because subs sets the condition flags; "memory" because the
  //    template reads buffer through a plain pointer operand, so the compiler
  //    must have the initializer stores completed before the asm runs.
  //  - A numeric local label is used so the block can be duplicated (inlining,
  //    cloning) without producing a duplicate symbol.
  asm volatile(".align 4\n"
               "1:\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "subs %[counter], %[counter], #1\n"
               "bne 1b"
               : [w2] "=&r"(w2), [w3] "=&r"(w3), [counter] "+r"(counter)
               : [x0] "r"(buffer)
               : "cc", "memory");
  auto end = std::chrono::high_resolution_clock::now();
  double nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
  std::cout << "Time taken: " << nanoseconds << " ns" << std::endl;
  // Print the loaded values
  std::cout << std::hex << "w2: " << w2 << ", w3: " << w3 << std::endl;
}
// Times a loop whose body is four repetitions of: two 32-bit stores to
// buffer[2] and buffer[3] followed by one paired load (ldp) of the same two
// words. Prints the stored value, the elapsed time in nanoseconds, and the
// two values left in the destination registers.
// AArch64 only (GNU-style inline assembly).
void demof() {
  uint32_t w2, w3;
  uint32_t buffer[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0};
  size_t counter = 100000;
  // Value written by every store. It is handed to the asm as an explicit
  // input operand instead of naming a fixed register in the template, so the
  // compiler knows which register is read and that it holds a defined value.
  int result = 1;
  auto begin = std::chrono::high_resolution_clock::now();
  // Operand contract:
  //  - w2/w3 are early-clobber ("=&r") so they never alias the base pointer
  //    or the stored value, both of which are re-read on every iteration.
  //  - counter is read-write ("+r") because subs decrements it in place.
  //  - "cc" for the flags set by subs; "memory" because the template both
  //    reads and writes buffer through a plain pointer operand.
  //  - A numeric local label avoids duplicate symbols if the block is copied.
  asm volatile(".align 4\n"
               "1:\n"
               "str %w[val], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "str %w[val], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "str %w[val], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "str %w[val], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldp %w[w2], %w[w3], [%[x0], #8]\n"
               "subs %[counter], %[counter], #1\n"
               "bne 1b"
               : [w2] "=&r"(w2), [w3] "=&r"(w3), [counter] "+r"(counter)
               : [x0] "r"(buffer), [val] "r"(result)
               : "cc", "memory");
  // Take the end timestamp before any printing so that console output is not
  // part of the measured interval.
  auto end = std::chrono::high_resolution_clock::now();
  std::printf("result: %d\n", result);
  double nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
  std::cout << "Time taken: " << nanoseconds << " ns" << std::endl;
  // Print the loaded values
  std::cout << std::hex << "w2: " << w2 << ", w3: " << w3 << std::endl;
}
// Times a loop whose body is four pairs of single 32-bit loads (ldr) from
// buffer[2] and buffer[3] (byte offsets 8 and 12), then prints the elapsed
// time in nanoseconds and the two values left in the destination registers.
// AArch64 only (GNU-style inline assembly).
void demo2() {
  uint32_t w2, w3;
  uint32_t buffer[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0};
  size_t counter = 100000;
  auto begin = std::chrono::high_resolution_clock::now();
  // Operand contract:
  //  - w2/w3 are early-clobber ("=&r"): each is written while the base
  //    pointer is still needed by later loads and later iterations.
  //  - counter is read-write ("+r") because subs decrements it in place.
  //  - "cc" for the flags set by subs; "memory" because the template reads
  //    buffer through a plain pointer operand.
  //  - A numeric local label avoids duplicate symbols if the block is copied.
  asm volatile(".align 4\n"
               "1:\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "subs %[counter], %[counter], #1\n"
               "bne 1b"
               : [w2] "=&r"(w2), [w3] "=&r"(w3), [counter] "+r"(counter)
               : [x0] "r"(buffer)
               : "cc", "memory");
  auto end = std::chrono::high_resolution_clock::now();
  double nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
  std::cout << "Time taken: " << nanoseconds << " ns" << std::endl;
  // Print the loaded values
  std::cout << std::hex << "w2: " << w2 << ", w3: " << w3 << std::endl;
}
// Times a loop whose body is four repetitions of: two 32-bit stores to
// buffer[2] and buffer[3], followed by two single 32-bit loads (ldr) of the
// same words. Prints the elapsed time in nanoseconds and the two values left
// in the destination registers.
// AArch64 only (GNU-style inline assembly).
void demo2f() {
  uint32_t w2, w3;
  uint32_t buffer[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0};
  size_t counter = 100000;
  // Value written by every store. It is an explicit input operand rather than
  // a fixed register named in the template, so the stored register is known
  // to the compiler and cannot collide with the other operands.
  const uint32_t stored = 1;
  auto begin = std::chrono::high_resolution_clock::now();
  // Operand contract:
  //  - w2/w3 are early-clobber ("=&r") so they never alias the base pointer
  //    or the stored value, both of which are re-read after they are written.
  //  - counter is read-write ("+r") because subs decrements it in place.
  //  - "cc" for the flags set by subs; "memory" because the template both
  //    reads and writes buffer through a plain pointer operand.
  //  - A numeric local label avoids duplicate symbols if the block is copied.
  asm volatile(".align 4\n"
               "1:\n"
               "str %w[val], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[val], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[val], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[val], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "subs %[counter], %[counter], #1\n"
               "bne 1b"
               : [w2] "=&r"(w2), [w3] "=&r"(w3), [counter] "+r"(counter)
               : [x0] "r"(buffer), [val] "r"(stored)
               : "cc", "memory");
  auto end = std::chrono::high_resolution_clock::now();
  double nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
  std::cout << "Time taken: " << nanoseconds << " ns" << std::endl;
  // Print the loaded values
  std::cout << std::hex << "w2: " << w2 << ", w3: " << w3 << std::endl;
}
// Times a loop whose body is four repetitions of: store to buffer[2], load
// buffer[2], store to buffer[3], load buffer[3] — i.e. each 32-bit store is
// immediately followed by a load of the same word. Prints the elapsed time in
// nanoseconds and the two values left in the destination registers.
// AArch64 only (GNU-style inline assembly).
void demo2ff() {
  uint32_t w2, w3;
  uint32_t buffer[4] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0};
  size_t counter = 100000;
  // Value written by every store. It is an explicit input operand rather than
  // a fixed register named in the template, so the stored register is known
  // to the compiler and cannot collide with the other operands.
  const uint32_t stored = 1;
  auto begin = std::chrono::high_resolution_clock::now();
  // Operand contract:
  //  - w2/w3 are early-clobber ("=&r") so they never alias the base pointer
  //    or the stored value, both of which are re-read after they are written.
  //  - counter is read-write ("+r") because subs decrements it in place.
  //  - "cc" for the flags set by subs; "memory" because the template both
  //    reads and writes buffer through a plain pointer operand.
  //  - A numeric local label avoids duplicate symbols if the block is copied.
  asm volatile(".align 4\n"
               "1:\n"
               "str %w[val], [%[x0], #8]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[val], [%[x0], #8]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[val], [%[x0], #8]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "str %w[val], [%[x0], #8]\n"
               "ldr %w[w2], [%[x0], #8]\n"
               "str %w[val], [%[x0], #0x0c]\n"
               "ldr %w[w3], [%[x0], #0x0C]\n"
               "subs %[counter], %[counter], #1\n"
               "bne 1b"
               : [w2] "=&r"(w2), [w3] "=&r"(w3), [counter] "+r"(counter)
               : [x0] "r"(buffer), [val] "r"(stored)
               : "cc", "memory");
  auto end = std::chrono::high_resolution_clock::now();
  double nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
  std::cout << "Time taken: " << nanoseconds << " ns" << std::endl;
  // Print the loaded values
  std::cout << std::hex << "w2: " << w2 << ", w3: " << w3 << std::endl;
}
// Runs each load/store micro-benchmark exactly once, in a fixed order.
// Command-line arguments are accepted but have no effect: nothing in the
// program consumes an attempt count, so the previously parsed-but-unused
// value has been removed.
int main(int, char **) {
  demo();
  demof();
  demo2();
  demo2f();
  demo2ff();
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment