Skip to content

Instantly share code, notes, and snippets.

View lemire's full-sized avatar
🚀
working hard and fast

Daniel Lemire lemire

🚀
working hard and fast
View GitHub Profile
@lemire
lemire / simdinterleave.h
Created January 9, 2018 21:29
fast AVX bit-interleave
static inline __m256i interleave_uint8_with_zeros_avx(__m256i word) {
const __m256i m3 = _mm256_set1_epi64x(0x0f0f0f0f0f0f0f0f);
const __m256i m4 = _mm256_set1_epi64x(0x3333333333333333);
const __m256i m5 = _mm256_set1_epi64x(0x5555555555555555);
word = _mm256_xor_si256(word , _mm256_slli_epi16(word , 4));
word = _mm256_and_si256(word , m3);
word = _mm256_xor_si256(word , _mm256_slli_epi16(word , 2));
word = _mm256_and_si256(word , m4);
word = _mm256_xor_si256(word , _mm256_slli_epi16(word , 1));
word = _mm256_and_si256(word , m5);
@lemire
lemire / template_static_assert.cpp
Created February 28, 2018 02:48
checking template values with static_assert
// Compile-time guard on a template value: bobo<x> can only be instantiated
// when x is greater than 1. Any other value trips the static_assert below and
// stops the build with the diagnostic "bad".
template <int x>
class bobo {
  static_assert(1 < x, "bad");
};
// Demonstration driver: exists only to show the static_assert in bobo firing.
int main() {
// Deliberately instantiates bobo with 0. bobo's static_assert requires x>1,
// so this declaration is expected to be rejected at compile time with the
// message "bad"; the program is not meant to build or run.
bobo <0> b;
}
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
function compute(d) {
var start = 0, // incl
end = d; // excl
var byte_acc = 0,
bit_num = 0,
iter = 50,
limit = 4;
@lemire
lemire / mandelbrot.c
Last active June 27, 2018 19:07
Mandelbrot without putc
#include <stdio.h>
#include <stdlib.h>
//#include <malloc.h>
int main (int argc, char **argv)
{
int w, h, bit_num = 0;
char byte_acc = 0;
int i, iter = 50;
double x, y, limit = 2.0;
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <vector>
class Person {
public:
Person() : _next(NULL), _prev(NULL) { _count = 0; }
@lemire
lemire / zeroupper.c
Created August 25, 2018 05:17
What does vzeroupper do?
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>
int main() {
__m512i x = _mm512_set1_epi32(1);
__m512i xxxx = _mm512_set1_epi32(12);
__m256i xx = _mm256_set1_epi32(1);
asm volatile("vzeroupper\n" : "+x"(xx), "+x"(x), "+x"(xxxx) :);
@lemire
lemire / zeroupperdoesnot.c
Created August 25, 2018 05:43
zeroupperdoesnot.c
int main() {
__m512i vec = _mm512_set1_epi16(0xFFFF), vec2;
asm(
"vmovdqa32 %[vec], %%zmm31\n\t"
"vzeroupper\n\t"
"vmovdqa32 %%zmm31, %[vec2]\n\t"
: [vec] "+x" (vec), [vec2] "=x" (vec2) :: );
__m512i *a[] = {&vec, &vec2, 0}, **p = a;
@lemire
lemire / avx512andzero.c
Last active August 25, 2018 14:53
It appears that any library call is enough to leave the registers dirty on AVX-512-capable hardware
/**
$ gcc -O2 -o fun fun.c -march=native -mno-avx512f && perf stat ./fun a b c 2>&1 |grep GHz && perf stat ./fun a b 2>&1 |grep GHz && perf stat ./fun a 2>&1 |grep GHz && perf stat ./fun 2>&1 |grep GHz
40,718,941 cycles # 3.167 GHz
40,719,004 cycles # 3.163 GHz
40,735,489 cycles # 2.796 GHz
40,707,631 cycles # 2.796 GHz
**/
#include <x86intrin.h>
#include <stdlib.h>
@lemire
lemire / badavx512.c
Created September 1, 2018 00:39
Why no throttling?
/***
$ cc -O3 -g3 -o badavx512 badavx512.c -march=native
$ perf stat ./badavx512
33686018
Performance counter stats for './badavx512':
1963.502013 task-clock (msec) # 1.000 CPUs utilized
3 context-switches # 0.002 K/sec