spion/a-warning.md

## a-warning.md

      
    Raw
  

              a-warning.md
            
          
    Warning

This benchmark has been misleading for a while. It was originally made to demonstrate how
JIT compilers can do all sorts of crazy stuff to your code - especially LuaJIT - and was
meant to be a starting point of discussion about what exactly LuaJIT does and how.
As a result, its not indicative of what its performance may be on more realistic data. Differences
can be expected because

the text will not consist of hard-coded constants
the number of words (and therefore the dictionary) would be larger, and JIT compilers for
JS and Lua often have special optimizations for small dictionaries/tables
the words wont be pre-split, and allocating new words adds significant performance penalty
(in that case a trie would probably outperform other approaches)


## benchmark
spion@missperfect:~/Projects/testhash$ gcc -O3 test.c -o testc
spion@missperfect:~/Projects/testhash$ time ./testc
The 1000000
a 2000000
at 1000000
brown 1000000
dog 1000000
era 1000000
fox 1000000
jumped 1000000
lake 1000000
lazy 1000000
new 1000000
near 1000000
of 1000000
over 1000000
quick 1000000
restaurant 1000000
the 2000000

real	0m0.234s
user	0m0.228s
sys	0m0.004s


spion@missperfect:~/Projects/testhash$ g++ -O3 -std=gnu++0x test.cpp -o test
spion@missperfect:~/Projects/testhash$ time ./test
the 2000000
at 1000000
a 2000000
brown 1000000
dog 1000000
era 1000000
fox 1000000
jumped 1000000
The 1000000
lake 1000000
lazy 1000000
new 1000000
near 1000000
of 1000000
over 1000000
quick 1000000
restaurant 1000000

real	0m0.321s
user	0m0.320s
sys	0m0.000s


spion@missperfect:~/Projects/testhash$ time node test.js
The 1000000
quick 1000000
brown 1000000
fox 1000000
jumped 1000000
over 1000000
the 2000000
lazy 1000000
dog 1000000
at 1000000
a 2000000
restaurant 1000000
near 1000000
lake 1000000
of 1000000
new 1000000
era 1000000

real	0m0.317s
user	0m0.304s
sys	0m0.012s


spion@missperfect:~/Projects/testhash$ time luajit test.lua
new 1000000
lazy 1000000
The 1000000
era 1000000
of 1000000
at 1000000
fox 1000000
jumped 1000000
quick 1000000
brown 1000000
lake 1000000
over 1000000
near 1000000
the 2000000
dog 1000000
a 2000000
restaurant 1000000

real	0m0.161s
user	0m0.160s
sys	0m0.000s

## test.c
// Like in the c++ version, we
// have a regular hash table
// with a custom hash function

#include <string.h>
#include <stdlib.h>
#include <stdio.h>

struct fht_node {
    char* key;
    void* data;
    struct fht_node * next;
};

typedef struct fht {
    struct fht_node ** nodes;
    size_t size;
} FHT;


FHT* fht_create() {
    FHT *ht = malloc(sizeof(struct fht));
    ht->size = 1 << (sizeof(char)*8);
    ht->nodes = calloc(ht->size, sizeof(struct fht_node));
    return ht;
}

fht_put(FHT* hashtbl, char* key, void* data) {
    struct fht_node *node = hashtbl->nodes[key[0]];

    while(node) {
        if(!strcmp(node->key, key)) {
            node->data=data;
            return 0;
        }
        node=node->next;
    }

    node=malloc(sizeof(struct fht_node));
    node->key= strdup(key);
    node->data=data;
    node->next=hashtbl->nodes[key[0]];
    hashtbl->nodes[key[0]]=node;
}

void* fht_get(FHT* hashtbl, char* key) {
    struct fht_node *node = hashtbl->nodes[key[0]];
    while (node) {
        if (!strcmp(node->key, key)) return node->data;
        node = node->next;
    }
    return NULL;
}

int main() {

    char* text[19] = {
        "The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a",
        "restaurant", "near", "the", "lake", "of", "a", "new", "era"
    };

    FHT *hashtbl = fht_create();

    int textLen = 19;

    int times = 1000000;
    int k, *cnt;


    while (times--)
        for (k = 0; k < textLen; ++k) {
            cnt = fht_get(hashtbl, text[k]);
            if (!cnt) {
                cnt = malloc(sizeof(int));
                *cnt = 1;
                fht_put(hashtbl, text[k], cnt);
            } else { *cnt += 1; }
        }


    for (k = 0; k < hashtbl->size; ++k) {
        struct fht_node *n = hashtbl->nodes[k];
        while (n) {
            printf("%s %d\n", n->key, *((int *)n->data));
            n = n->next;
        }
    }
}

## test.cpp
#include <unordered_map>
#include <string>
#include <vector>
#include <iostream>
#include <string>

using namespace std;

// In C++ despite only accessing the hashtable
// once per loop, we also need a custom hash function
// otherwise the program will be 2 times slower.

struct CustomHash {
    size_t operator() (const string& s) const { return (size_t)s[0]; }
};

int main() {

    vector<string> text = {"The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"};
    unordered_map<string, int, CustomHash> cnt;
    int times = 1000000;
    while (times--)
        for (auto it = text.begin(); it != text.end(); ++it)
            cnt[*it] += 1;

    for (auto it = cnt.begin(); it != cnt.end(); ++it) {
        cout << it->first << " " << it->second << endl;
    }
}

## test.js
var text = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"],
    map = {};
    times = 1000001;

while (--times)
    for (var k = 0; k < text.length; ++k) {
        // Unlike luajit, if we put if (map[text[k]]) map[text[k]] += 1
        // in v8, this program will become 6 times slower. Instead we create a new object with a counter
        // and this way we'll only need to access the hashtable once per loop.
        var kth = map[text[k]];
        if (kth) kth.c += 1;
        else map[text[k]] = {c:1};
    }

for (var key in map) {
    console.log(key, map[key].c);
}

## test.lua
-- The lua program needs no optimizations.
-- It just works. Fast.
-- Note that luajit probably optimizes away
-- all dictionary lookups :)

local text = {"The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"}
local map = {}
local times = 1000000

while times > 0 do
    for i = 1, table.getn(text), 1 do
        if map[text[i]] then map[text[i]] = map[text[i]] + 1
        else map[text[i]] = 1 end
    end
    times = times - 1;
end

for key, value in pairs(map) do
    io.write(key, " ", value, "\n")
end
	spion@missperfect:~/Projects/testhash$ gcc -O3 test.c -o testc
	spion@missperfect:~/Projects/testhash$ time ./testc
	The 1000000
	a 2000000
	at 1000000
	brown 1000000
	dog 1000000
	era 1000000
	fox 1000000
	jumped 1000000
	lake 1000000
	lazy 1000000
	new 1000000
	near 1000000
	of 1000000
	over 1000000
	quick 1000000
	restaurant 1000000
	the 2000000

	real 0m0.234s
	user 0m0.228s
	sys 0m0.004s




	spion@missperfect:~/Projects/testhash$ g++ -O3 -std=gnu++0x test.cpp -o test
	spion@missperfect:~/Projects/testhash$ time ./test
	the 2000000
	at 1000000
	a 2000000
	brown 1000000
	dog 1000000
	era 1000000
	fox 1000000
	jumped 1000000
	The 1000000
	lake 1000000
	lazy 1000000
	new 1000000
	near 1000000
	of 1000000
	over 1000000
	quick 1000000
	restaurant 1000000

	real 0m0.321s
	user 0m0.320s
	sys 0m0.000s



	spion@missperfect:~/Projects/testhash$ time node test.js
	The 1000000
	quick 1000000
	brown 1000000
	fox 1000000
	jumped 1000000
	over 1000000
	the 2000000
	lazy 1000000
	dog 1000000
	at 1000000
	a 2000000
	restaurant 1000000
	near 1000000
	lake 1000000
	of 1000000
	new 1000000
	era 1000000

	real 0m0.317s
	user 0m0.304s
	sys 0m0.012s



	spion@missperfect:~/Projects/testhash$ time luajit test.lua
	new 1000000
	lazy 1000000
	The 1000000
	era 1000000
	of 1000000
	at 1000000
	fox 1000000
	jumped 1000000
	quick 1000000
	brown 1000000
	lake 1000000
	over 1000000
	near 1000000
	the 2000000
	dog 1000000
	a 2000000
	restaurant 1000000

	real 0m0.161s
	user 0m0.160s
	sys 0m0.000s
	// Like in the c++ version, we
	// have a regular hash table
	// with a custom hash function

	#include <string.h>
	#include <stdlib.h>
	#include <stdio.h>

	struct fht_node {
	char* key;
	void* data;
	struct fht_node * next;
	};

	typedef struct fht {
	struct fht_node ** nodes;
	size_t size;
	} FHT;


	FHT* fht_create() {
	FHT *ht = malloc(sizeof(struct fht));
	ht->size = 1 << (sizeof(char)*8);
	ht->nodes = calloc(ht->size, sizeof(struct fht_node));
	return ht;
	}

	fht_put(FHT* hashtbl, char* key, void* data) {
	struct fht_node *node = hashtbl->nodes[key[0]];

	while(node) {
	if(!strcmp(node->key, key)) {
	node->data=data;
	return 0;
	}
	node=node->next;
	}

	node=malloc(sizeof(struct fht_node));
	node->key= strdup(key);
	node->data=data;
	node->next=hashtbl->nodes[key[0]];
	hashtbl->nodes[key[0]]=node;
	}

	void* fht_get(FHT* hashtbl, char* key) {
	struct fht_node *node = hashtbl->nodes[key[0]];
	while (node) {
	if (!strcmp(node->key, key)) return node->data;
	node = node->next;
	}
	return NULL;
	}

	int main() {

	char* text[19] = {
	"The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a",
	"restaurant", "near", "the", "lake", "of", "a", "new", "era"
	};

	FHT *hashtbl = fht_create();

	int textLen = 19;

	int times = 1000000;
	int k, *cnt;


	while (times--)
	for (k = 0; k < textLen; ++k) {
	cnt = fht_get(hashtbl, text[k]);
	if (!cnt) {
	cnt = malloc(sizeof(int));
	*cnt = 1;
	fht_put(hashtbl, text[k], cnt);
	} else { *cnt += 1; }
	}


	for (k = 0; k < hashtbl->size; ++k) {
	struct fht_node *n = hashtbl->nodes[k];
	while (n) {
	printf("%s %d\n", n->key, ((int )n->data));
	n = n->next;
	}
	}
	}
	#include <unordered_map>
	#include <string>
	#include <vector>
	#include <iostream>
	#include <string>

	using namespace std;

	// In C++ despite only accessing the hashtable
	// once per loop, we also need a custom hash function
	// otherwise the program will be 2 times slower.

	struct CustomHash {
	size_t operator() (const string& s) const { return (size_t)s[0]; }
	};

	int main() {

	vector<string> text = {"The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"};
	unordered_map<string, int, CustomHash> cnt;
	int times = 1000000;
	while (times--)
	for (auto it = text.begin(); it != text.end(); ++it)
	cnt[*it] += 1;

	for (auto it = cnt.begin(); it != cnt.end(); ++it) {
	cout << it->first << " " << it->second << endl;
	}
	}
	var text = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"],
	map = {};
	times = 1000001;

	while (--times)
	for (var k = 0; k < text.length; ++k) {
	// Unlike luajit, if we put if (map[text[k]]) map[text[k]] += 1
	// in v8, this program will become 6 times slower. Instead we create a new object with a counter
	// and this way we'll only need to access the hashtable once per loop.
	var kth = map[text[k]];
	if (kth) kth.c += 1;
	else map[text[k]] = {c:1};
	}

	for (var key in map) {
	console.log(key, map[key].c);
	}
	-- The lua program needs no optimizations.
	-- It just works. Fast.
	-- Note that luajit probably optimizes away
	-- all dictionary lookups :)

	local text = {"The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"}
	local map = {}
	local times = 1000000

	while times > 0 do
	for i = 1, table.getn(text), 1 do
	if map[text[i]] then map[text[i]] = map[text[i]] + 1
	else map[text[i]] = 1 end
	end
	times = times - 1;
	end

	for key, value in pairs(map) do
	io.write(key, " ", value, "\n")
	end