Skip to content

Instantly share code, notes, and snippets.

View lemire's full-sized avatar
🚀
working hard and fast

Daniel Lemire lemire

🚀
working hard and fast
View GitHub Profile
@lemire
lemire / copybenchmark.c
Last active October 27, 2017 03:34
ARM NEON poor man's memcpy to sanity-check NEON performance
///////////////////
// A decent machine ought to be able to copy one 32-bit integer per CPU cycle
// using vectorized instructions. Furthermore, the vectorized loads and stores
// should be close to the performance of a memcpy, one would expect, for sizeable
// arrays.
//
// This tests the sanity of your ARM (aarch64) system:
//
// cc -O3 -o copybenchmark copybenchmark.c -std=c99
// ./copybenchmark
@lemire
lemire / simdinterleave.h
Created January 9, 2018 21:29
fast AVX bit-interleave
static inline __m256i interleave_uint8_with_zeros_avx(__m256i word) {
const __m256i m3 = _mm256_set1_epi64x(0x0f0f0f0f0f0f0f0f);
const __m256i m4 = _mm256_set1_epi64x(0x3333333333333333);
const __m256i m5 = _mm256_set1_epi64x(0x5555555555555555);
word = _mm256_xor_si256(word , _mm256_slli_epi16(word , 4));
word = _mm256_and_si256(word , m3);
word = _mm256_xor_si256(word , _mm256_slli_epi16(word , 2));
word = _mm256_and_si256(word , m4);
word = _mm256_xor_si256(word , _mm256_slli_epi16(word , 1));
word = _mm256_and_si256(word , m5);
@lemire
lemire / template_static_assert.cpp
Created February 28, 2018 02:48
checking template values with static_assert
// Class template that can only be instantiated with an integer argument
// strictly greater than 1; any other value is rejected at compile time.
template <int x>
class bobo {
  // Compile-time check on the template argument; the diagnostic text is "bad".
  static_assert(1 < x, "bad");
};
// Demonstration driver: it instantiates bobo with the value 0, which does not
// satisfy bobo's requirement that its template argument be greater than 1.
// The program is therefore expected to be rejected by the compiler with the
// static_assert message "bad" rather than to build and run.
int main() {
bobo <0> b; // defining an object forces instantiation of bobo<0>, triggering the static_assert
}
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
function compute(d) {
var start = 0, // incl
end = d; // excl
var byte_acc = 0,
bit_num = 0,
iter = 50,
limit = 4;
@lemire
lemire / mandelbrot.c
Last active June 27, 2018 19:07
Mandelbrot without putc
#include <stdio.h>
#include <stdlib.h>
//#include <malloc.h>
int main (int argc, char **argv)
{
int w, h, bit_num = 0;
char byte_acc = 0;
int i, iter = 50;
double x, y, limit = 2.0;
@lemire
lemire / fastaltmod.c
Created July 12, 2018 20:10
fast alternative to the modulo reduction (code sample)
#include <iostream>
#include <cstdint>
using namespace std;
// we are going to generate random numbers using
// an xorshift generator
static uint32_t xorshift_y;
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <vector>
class Person {
public:
Person() : _next(NULL), _prev(NULL) { _count = 0; }
@lemire
lemire / zeroupper.c
Created August 25, 2018 05:17
what does zero upper do?
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>
int main() {
__m512i x = _mm512_set1_epi32(1);
__m512i xxxx = _mm512_set1_epi32(12);
__m256i xx = _mm256_set1_epi32(1);
asm volatile("vzeroupper\n" : "+x"(xx), "+x"(x), "+x"(xxxx) :);
@lemire
lemire / zeroupperdoesnot.c
Created August 25, 2018 05:43
zeroupperdoesnot.c
int main() {
__m512i vec = _mm512_set1_epi16(0xFFFF), vec2;
asm(
"vmovdqa32 %[vec], %%zmm31\n\t"
"vzeroupper\n\t"
"vmovdqa32 %%zmm31, %[vec2]\n\t"
: [vec] "+x" (vec), [vec2] "=x" (vec2) :: );
__m512i *a[] = {&vec, &vec2, 0}, **p = a;