Skip to content

Instantly share code, notes, and snippets.

@zeux
Last active June 9, 2023 07:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zeux/37f8f9dd7840ac0217ee862ba3a269fd to your computer and use it in GitHub Desktop.
Save zeux/37f8f9dd7840ac0217ee862ba3a269fd to your computer and use it in GitHub Desktop.
On Nature paper about sorting algorithms. Thread for context: https://mastodon.gamedev.place/@zeux/110510029570470184.
/*
The Nature paper about sorting algorithms has an "improvement" for sort3 that saves a mov.
Thread for context: https://mastodon.gamedev.place/@zeux/110510029570470184
This code is experimentally verifying that the proposed optimization is perf neutral
(aka is not improving performance). You'll need to remove the mov from all 3 versions
and retest; feel free to test one version at a time.
Cycle count established by using 'perf stat' on Ryzen 7 5900X - it does not depend on
whether the mov is there.
Register layout for all three versions (using volatile registers to avoid save/restore):
P %eax
Q %esi
R %ecx
S %edx
Note: all routines are called "sort3" but these are implementing the last 2 stages of a
3-stage 3-element sorting network. This matches "figure 3" from the paper.
*/
__attribute__((noinline))
__attribute__((naked))
void sort3(int* data)
{
__asm__(
"movl 0(%rdi), %eax\n"
"movl 4(%rdi), %esi\n"
"movl 8(%rdi), %ecx\n"
"movl %ecx, %edx\n"
"cmpl %ecx, %eax\n"
"cmovg %eax, %ecx\n"
"cmovl %eax, %edx\n"
"movl %edx, %eax\n" // remove this line to discover the secret
"cmpl %esi, %edx\n"
"cmovg %esi, %eax\n"
"cmovg %edx, %esi\n"
"movl %eax, 0(%rdi)\n"
"movl %esi, 4(%rdi)\n"
"movl %ecx, 8(%rdi)\n"
"ret\n"
);
}
__attribute__((noinline))
__attribute__((naked))
void sort3loop(int* data, int count)
{
__asm__(
"movl %esi, %r8d\n"
".loop:\n"
"movl 0(%rdi), %eax\n"
"movl 4(%rdi), %esi\n"
"movl 8(%rdi), %ecx\n"
"movl %ecx, %edx\n"
"cmpl %ecx, %eax\n"
"cmovg %eax, %ecx\n"
"cmovl %eax, %edx\n"
"movl %edx, %eax\n" // remove this line to discover the secret
"cmpl %esi, %edx\n"
"cmovg %esi, %eax\n"
"cmovg %edx, %esi\n"
"movl %eax, 0(%rdi)\n"
"movl %esi, 4(%rdi)\n"
"movl %ecx, 8(%rdi)\n"
"dec %r8d\n"
"jnz .loop\n"
"ret\n"
);
}
__attribute__((noinline))
__attribute__((naked))
void sort3loopreg(int* data, int count)
{
__asm__(
"movl %esi, %r8d\n"
".loopreg:\n"
"movl 0(%rdi), %eax\n"
"movl 4(%rdi), %esi\n"
"movl 8(%rdi), %ecx\n"
"movl %ecx, %edx\n"
"cmpl %ecx, %eax\n"
"cmovg %eax, %ecx\n"
"cmovl %eax, %edx\n"
"movl %edx, %eax\n" // remove this line to discover the secret
"cmpl %esi, %edx\n"
"cmovg %esi, %eax\n"
"cmovg %edx, %esi\n"
"dec %r8d\n"
"jnz .loopreg\n"
"movl %eax, 0(%rdi)\n"
"movl %esi, 4(%rdi)\n"
"movl %ecx, 8(%rdi)\n"
"ret\n"
);
}
int main()
{
int data[3] = { 3, 1, 2 };
// 6 cycles per iteration (including call overhead)
for (int i = 0; i < 1'000'000'000; ++i) sort3(data);
// 4 cycles per iteration (load/store memory)
sort3loop(data, 1'000'000'000);
// 3 cycles per iteration (sort registers to avoid inter-iteration dependencies)
sort3loopreg(data, 1'000'000'000);
return data[0] == 1 && data[1] == 2 && data[2] == 3 ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment