Skip to content

Instantly share code, notes, and snippets.

@beezly
Created April 11, 2020 17:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beezly/4fe7a086ad1a1a7b02b2616a8ce08c9b to your computer and use it in GitHub Desktop.
Save beezly/4fe7a086ad1a1a7b02b2616a8ce08c9b to your computer and use it in GitHub Desktop.
MMX/SSE demo
/*
On a VM I got...
Allocating 134217728 bytes of RAM and writing a pattern to it.
allocate_ram: Took 15770 cycles
before first x86_add: Address 0xafd54010: 00000000000000000000000000000000
after first x86_add: Address 0xafd54010: 02020202020202020202020202020202
after last x86_add: Address 0xafd54010: cacacacacacacacacacacacacacacaca
x86_add: Took 13213002422 cycles
before first mmx_add: Address 0xafd54010: cacacacacacacacacacacacacacacaca
after first mmx add: Address 0xafd54010: cccccccccccccccccccccccccccccccc
after last mmx add: Address 0xafd54010: 94949494949494949494949494949494
mmx_add: Took 3867184649 cycles
mmx_add is 3.42 quicker than x86_add
before first sse_add: Address 0xafd54010: 94949494949494949494949494949494
after first sse_add: Address 0xafd54010: 96969696969696969696969696969696
after last sse_add: Address 0xafd54010: 5e5e5e5e5e5e5e5e5e5e5e5e5e5e5e5e
sse_add: Took 3817400474 cycles
sse_add is 3.46 quicker than x86_add
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <x86intrin.h>
// Compile with gcc -o demo -mmmx -msse4 ./demo.c
/* Size in bytes of the buffer each *_add routine walks (128 MiB).
   Parenthesized so the expansion stays one operand inside any expression. */
#define RAM_SIZE (16 * 1024 * 1024 * 8)
/* Number of back-to-back calls issued by repeat(). */
#define CYCLE_COUNT 100
/* Print the difference between two cycle-counter readings for the named task.
   The cast keeps the argument matched to %llu even where uint64_t is unsigned long. */
#define timing(task, start, end) \
    printf("%s: Took %llu cycles\n", (task), (unsigned long long)((end) - (start)))
/* Run statement x CYCLE_COUNT times. The counter has a reserved-looking name so
   it cannot shadow a variable that x itself refers to. */
#define repeat(x) for (int repeat_i_ = 0; repeat_i_ < CYCLE_COUNT; repeat_i_++) { x; }
/*
 * Print `msg`, the address held in `ram`, and then the first `length` bytes
 * found at that address as two-digit lowercase hex, followed by a newline.
 * Nothing is printed for the byte list when `length` is zero or negative.
 */
void display(const char *msg, const char *ram, int length) {
    /* Read through an unsigned view so bytes >= 0x80 are not sign-extended. */
    const unsigned char *bytes = (const unsigned char *)ram;

    /* %p is specified for void pointers, hence the explicit cast. */
    printf("%s: Address %p: ", msg, (const void *)ram);
    for (int i = 0; i < length; i++) {
        printf("%02hhx", bytes[i]);
    }
    printf("\n");
}
/* Hand back a zero-filled heap buffer of `size` bytes obtained from calloc.
   The calloc result is returned as-is, so it is NULL when the request fails. */
char *allocate_ram(size_t size) {
    char *buffer = calloc(size, sizeof *buffer);
    return buffer;
}
void x86_add(uint8_t v, char *ram) {
asm __volatile(
"mov $0, %%eax \n"
"1: \n"
"addb %2, (%0,%%eax) \n"
"inc %%eax \n"
"cmp %1, %%eax \n"
"jne 1b"
:
: "r" (ram), "i" (RAM_SIZE), "r" (v)
: "memory", "%eax"
);
}
void mmx_add(uint8_t v, char *ram) {
uint64_t vmmx;
vmmx=v;
vmmx=(vmmx << 8) + v;
vmmx=(vmmx << 8) + v;
vmmx=(vmmx << 8) + v;
vmmx=(vmmx << 8) + v;
vmmx=(vmmx << 8) + v;
vmmx=(vmmx << 8) + v;
vmmx=(vmmx << 8) + v;
asm volatile (
"mov %0, %%esi \n"
"addl %1, %%esi \n"
"1: \n"
"movq (%2), %%mm0 \n"
"paddb (%0),%%mm0 \n"
"movq %%mm0, (%0) \n"
"add $8, %0 \n"
"cmp %%esi, %0 \n"
"jne 1b \n"
"emms \n"
:
: "r" (ram), "i" (RAM_SIZE), "r" (&vmmx)
: "memory", "%esi", "mm0"
);
}
void sse_add(uint8_t v, char *ram) {
asm __volatile(
"mov %1, %%eax \n"
"add %0, %%eax \n" // find our end point
"mov %2, %%bh \n" // fill xmm1 with our add value
"mov %2, %%bl \n"
"pinsrw $0, %%ebx, %%xmm1 \n"
"pinsrw $1, %%ebx, %%xmm1 \n"
"pinsrw $2, %%ebx, %%xmm1 \n"
"pinsrw $3, %%ebx, %%xmm1 \n"
"pinsrw $4, %%ebx, %%xmm1 \n"
"pinsrw $5, %%ebx, %%xmm1 \n"
"pinsrw $6, %%ebx, %%xmm1 \n"
"pinsrw $7, %%ebx, %%xmm1 \n"
"2: \n"
"vpaddb (%0), %%xmm1, %%xmm0 \n" // add xm1 to data @%2 and write into xm0
"movq %%xmm0, (%0) \n"
"movhlps %%xmm0, %%xmm0 \n" //shift top half of xmm0 into bottom half.
"add $8, %0 \n" // skip 8 bytes
"movq %%xmm0, (%0) \n"
"add $8, %0 \n" // skip 8 bytes
"cmp %0, %%eax \n"
"jne 2b"
:
: "r" (ram), "i" (RAM_SIZE), "r" (v)
: "memory", "%eax", "%ebx", "%xmm1", "%xmm0"
);
}
/*
 * Benchmark driver. Allocates a RAM_SIZE-byte zeroed buffer, then runs the
 * x86, MMX and SSE add routines over it in turn, adding 2 to every byte on
 * each pass. For each routine the timed region covers one initial call, one
 * display() of the first 16 bytes, and CYCLE_COUNT further calls; elapsed
 * time is measured with __rdtsc(). The MMX and SSE timings are reported
 * relative to the x86 timing.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(int argc, char *argv[]) {
    (void)argc;   /* command-line arguments are not used */
    (void)argv;

    uint64_t start_time, end_time;

    printf("Allocating %i bytes of RAM and writing a pattern to it.\n", RAM_SIZE);
    start_time = __rdtsc();
    char *ram = allocate_ram(RAM_SIZE);
    end_time = __rdtsc();
    if (ram == NULL) {
        /* Every routine below writes through ram unconditionally. */
        fprintf(stderr, "Failed to allocate %i bytes of RAM.\n", RAM_SIZE);
        return EXIT_FAILURE;
    }
    timing("allocate_ram", start_time, end_time);

    /* x86 scalar baseline */
    display("before first x86_add", ram, 16);
    start_time = __rdtsc();
    x86_add(2, ram);
    display(" after first x86_add", ram, 16);
    repeat(x86_add(2, ram));
    end_time = __rdtsc();
    display(" after last x86_add", ram, 16);
    timing("x86_add", start_time, end_time);
    unsigned long long x86_add_time = end_time - start_time;

    /* MMX */
    display("before first mmx_add", ram, 16);
    start_time = __rdtsc();
    mmx_add(2, ram);
    display(" after first mmx add", ram, 16);
    repeat(mmx_add(2, ram));
    end_time = __rdtsc();
    display(" after last mmx add", ram, 16);
    timing("mmx_add", start_time, end_time);
    unsigned long long mmx_add_time = end_time - start_time;
    printf("mmx_add is %.2f quicker than x86_add\n", (float) x86_add_time / (float) mmx_add_time);

    /* SSE */
    display("before first sse_add", ram, 16);
    start_time = __rdtsc();
    sse_add(2, ram);
    display(" after first sse_add", ram, 16);
    repeat(sse_add(2, ram));
    end_time = __rdtsc();
    display(" after last sse_add", ram, 16);
    timing("sse_add", start_time, end_time);
    unsigned long long sse_add_time = end_time - start_time;
    printf("sse_add is %.2f quicker than x86_add\n", (float) x86_add_time / (float) sse_add_time);

    free(ram);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment