C++ versus V8 versus luajit versus C benchmark - (hash) tables

spion@missperfect:~/Projects/testhash$ gcc -O3 test.c -o testc
spion@missperfect:~/Projects/testhash$ time ./testc
The 1000000
a 2000000
at 1000000
brown 1000000
dog 1000000
era 1000000
fox 1000000
jumped 1000000
lake 1000000
lazy 1000000
new 1000000
near 1000000
of 1000000
over 1000000
quick 1000000
restaurant 1000000
the 2000000
 
real 0m0.234s
user 0m0.228s
sys 0m0.004s
 
 
 
 
spion@missperfect:~/Projects/testhash$ g++ -O3 -std=gnu++0x test.cpp -o test
spion@missperfect:~/Projects/testhash$ time ./test
the 2000000
at 1000000
a 2000000
brown 1000000
dog 1000000
era 1000000
fox 1000000
jumped 1000000
The 1000000
lake 1000000
lazy 1000000
new 1000000
near 1000000
of 1000000
over 1000000
quick 1000000
restaurant 1000000
 
real 0m0.321s
user 0m0.320s
sys 0m0.000s
 
 
 
spion@missperfect:~/Projects/testhash$ time node test.js
The 1000000
quick 1000000
brown 1000000
fox 1000000
jumped 1000000
over 1000000
the 2000000
lazy 1000000
dog 1000000
at 1000000
a 2000000
restaurant 1000000
near 1000000
lake 1000000
of 1000000
new 1000000
era 1000000
 
real 0m0.317s
user 0m0.304s
sys 0m0.012s
 
 
 
spion@missperfect:~/Projects/testhash$ time luajit test.lua
new 1000000
lazy 1000000
The 1000000
era 1000000
of 1000000
at 1000000
fox 1000000
jumped 1000000
quick 1000000
brown 1000000
lake 1000000
over 1000000
near 1000000
the 2000000
dog 1000000
a 2000000
restaurant 1000000
 
real 0m0.161s
user 0m0.160s
sys 0m0.000s
test.c:
// Like in the c++ version, we
// have a regular hash table
// with a custom hash function

#include <string.h>
#include <stdlib.h>
#include <stdio.h>

struct fht_node {
    char* key;
    void* data;
    struct fht_node * next;
};

typedef struct fht {
    struct fht_node ** nodes;
    size_t size;
} FHT;


FHT* fht_create() {
    FHT *ht = malloc(sizeof(struct fht));
    ht->size = 1 << (sizeof(char)*8);
    ht->nodes = calloc(ht->size, sizeof(struct fht_node));
    return ht;
}

fht_put(FHT* hashtbl, char* key, void* data) {
    struct fht_node *node = hashtbl->nodes[key[0]];

    while(node) {
        if(!strcmp(node->key, key)) {
            node->data=data;
            return 0;
        }
        node=node->next;
    }

    node=malloc(sizeof(struct fht_node));
    node->key= strdup(key);
    node->data=data;
    node->next=hashtbl->nodes[key[0]];
    hashtbl->nodes[key[0]]=node;
}

void* fht_get(FHT* hashtbl, char* key) {
    struct fht_node *node = hashtbl->nodes[key[0]];
    while (node) {
        if (!strcmp(node->key, key)) return node->data;
        node = node->next;
    }
    return NULL;
}

int main() {

    char* text[19] = {
        "The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a",
        "restaurant", "near", "the", "lake", "of", "a", "new", "era"
    };

    FHT *hashtbl = fht_create();

    int textLen = 19;

    int times = 1000000;
    int k, *cnt;

    while (times--)
        for (k = 0; k < textLen; ++k) {
            cnt = fht_get(hashtbl, text[k]);
            if (!cnt) {
                cnt = malloc(sizeof(int));
                *cnt = 1;
                fht_put(hashtbl, text[k], cnt);
            } else { *cnt += 1; }
        }

    for (k = 0; k < hashtbl->size; ++k) {
        struct fht_node *n = hashtbl->nodes[k];
        while (n) {
            printf("%s %d\n", n->key, *((int *)n->data));
            n = n->next;
        }
    }
}
test.cpp:
#include <unordered_map>
#include <string>
#include <vector>
#include <iostream>

using namespace std;

// In C++ despite only accessing the hashtable
// once per loop, we also need a custom hash function
// otherwise the program will be 2 times slower.

struct CustomHash {
    size_t operator() (const string& s) const { return (size_t)s[0]; }
};

int main() {

    vector<string> text = {"The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"};
    unordered_map<string, int, CustomHash> cnt;
    int times = 1000000;
    while (times--)
        for (auto it = text.begin(); it != text.end(); ++it)
            cnt[*it] += 1;

    for (auto it = cnt.begin(); it != cnt.end(); ++it) {
        cout << it->first << " " << it->second << endl;
    }
}
test.js:
var text = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"],
map = {};
times = 1000001;
 
while (--times)
for (var k = 0; k < text.length; ++k) {
// Unlike luajit, if we put if (map[text[k]]) map[text[k]] += 1
// in v8, this program will become 6 times slower. Instead we create a new object with a counter
// and this way we'll only need to access the hashtable once per loop.
var kth = map[text[k]];
if (kth) kth.c += 1;
else map[text[k]] = {c:1};
}
 
for (var key in map) {
console.log(key, map[key].c);
}
test.lua:
-- The lua program needs no optimizations.
-- It just works. Fast.
-- Note that luajit probably optimizes away
-- all dictionary lookups :)

local text = {"The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"}
local map = {}
local times = 1000000

while times > 0 do
    for i = 1, table.getn(text), 1 do
        if map[text[i]] then map[text[i]] = map[text[i]] + 1
        else map[text[i]] = 1 end
    end
    times = times - 1;
end

for key, value in pairs(map) do
    io.write(key, " ", value, "\n")
end

MoonScript version:

text = {"The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"}
map = {}
times = 1000000

while times > 0
  for i = 1, table.getn(text), 1
    if map[text[i]] then map[text[i]] += 1
    else map[text[i]] = 1
  times -= 1

for key, value in pairs map
  io.write(key, " ", value, "\n")

It works just as fast as the LuaJIT one.

Calling table.getn() in a loop in the Lua version is a bad idea, since in Lua getting the length of a table is not O(1).

On this machine replacing this:

while times > 0 do
    for i = 1, table.getn(text), 1 do

by this:

local sz = #text
while times > 0 do  
    for i = 1, sz do

makes the runtime go down from 87ms to 65ms.

@catwell +1

Owner

Interesting. In Lua it's slightly less obvious that the function gets called on every iteration.

The main point of this comparison was probably this though:

  • luajit is terrifyingly smart and defeats many attempts at (micro)benchmarking :D
  • I did not need to do anything special to get really good results.

Very interesting :)
However, the C implementation does seem to do some unnecessary allocation/copying, which might or might not explain the astonishing result that a JIT is faster than plain C.
Also, I suppose the hash table implementation in Lua is probably better (in whatever way) than the one in the C version?

  1. LuaJIT interns all strings, so lookup in a hash is done by string address instead of hashing the whole string
  2. it looks like it remembers the address of the slot in the hash, so that map[text[i]] = map[text[i]] + 1 effectively becomes (*(double*)&map[text[i]])++

luajit -jdump test.lua

@spion To be clear, table.getn was not called on each iteration of the for loop, but it was called on each iteration of the while loop.

The C version has memory leaks, so it is not 100% correct, and thus the benchmark numbers aren't entirely precise.

varnie@localhost:~$ valgrind --leak-check=yes ~/thrash/testc

==14344== Memcheck, a memory error detector
==14344== Copyright (C) 2002-2011, and GNU GPL'd, by Julian Seward et al.
==14344== Using Valgrind-3.7.0 and LibVEX; rerun with -h for copyright info
==14344== Command: /home/varnie/thrash/testc
==14344==
The 1000000
a 2000000
at 1000000
brown 1000000
dog 1000000
era 1000000
fox 1000000
jumped 1000000
lake 1000000
lazy 1000000
new 1000000
near 1000000
of 1000000
over 1000000
quick 1000000
restaurant 1000000
the 2000000
==14344==
==14344== HEAP SUMMARY:
==14344== in use at exit: 3,434 bytes in 53 blocks
==14344== total heap usage: 53 allocs, 0 frees, 3,434 bytes allocated
==14344==
==14344== 3,434 (8 direct, 3,426 indirect) bytes in 1 blocks are definitely lost in loss record 5 of 5
==14344== at 0x402BB7A: malloc (in /usr/lib/valgrind/vgpreload_memcheck-x86-linux.so)
==14344== by 0x804867F: fht_create (in /home/varnie/thrash/testc)
==14344== by 0x40684D2: (below main) (libc-start.c:226)
==14344==
==14344== LEAK SUMMARY:
==14344== definitely lost: 8 bytes in 1 blocks
==14344== indirectly lost: 3,426 bytes in 52 blocks
==14344== possibly lost: 0 bytes in 0 blocks
==14344== still reachable: 0 bytes in 0 blocks
==14344== suppressed: 0 bytes in 0 blocks
==14344==
==14344== For counts of detected and suppressed errors, rerun with: -v
==14344== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)

Here's the corrected C version:

// Like in the c++ version, we
// have a regular hash table
// with a custom hash function

#include <string.h>
#include <stdlib.h>
#include <stdio.h>

struct fht_node {
    char* key;
    void* data;
    struct fht_node * next;
};

typedef struct fht {
    struct fht_node ** nodes;
    size_t size;
} FHT;


FHT* fht_create() {
    FHT *ht = malloc(sizeof(struct fht));
    ht->size = 1 << (sizeof(char)*8);
    ht->nodes = calloc(ht->size, sizeof(struct fht_node));
    return ht;
}

fht_put(FHT* hashtbl, char* key, void* data) {
    struct fht_node *node = hashtbl->nodes[key[0]];

    while(node) {
        if(!strcmp(node->key, key)) {
            node->data=data;
            return 0;
        }

        node=node->next;
    }

    node=malloc(sizeof(struct fht_node));
    node->key= strdup(key);
    node->data=data;
    node->next=hashtbl->nodes[key[0]];
    hashtbl->nodes[key[0]]=node;
}

void* fht_get(FHT* hashtbl, char* key) {
    struct fht_node *node = hashtbl->nodes[key[0]];
    while (node) {
        if (!strcmp(node->key, key)) return node->data;
        node = node->next;
    }
    return NULL;
}

fht_dealloc(FHT *hashtbl) {

    int k;
    for (k = 0; k < hashtbl->size; ++k) {
        struct fht_node *n = hashtbl->nodes[k];
        while (n) {
            struct fht_node *old_n;
            old_n = n;

            n = old_n->next;

            free(old_n->key);
            free(old_n->data);
            free(old_n);
        }
    }

    free(hashtbl->nodes);

    free(hashtbl);
}

int main() {

    char* text[19] = {
        "The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a",
        "restaurant", "near", "the", "lake", "of", "a", "new", "era"
    };

    FHT *hashtbl = fht_create();

    int textLen = 19;

    int times = 1000000;
    int k, *cnt;

    while (times--) {
        for (k = 0; k < textLen; ++k) {
            cnt = fht_get(hashtbl, text[k]);
            if (!cnt) {
                cnt = malloc(sizeof(int));
                *cnt = 1;
                fht_put(hashtbl, text[k], cnt);
            } else { *cnt += 1; }
        }
    }

    for (k = 0; k < hashtbl->size; ++k) {
        struct fht_node *n = hashtbl->nodes[k];
        while (n) {
            printf("%s %d\n", n->key, *((int *)n->data));
            n = n->next;
        }
    }

    fht_dealloc(hashtbl);
}
mafik commented

Freeing memory before end of program is a waste of time.

It is nonsense (no matter that in some scenarios it might work without a severe impact). It is praising the kind of ignorance that good programmers avoid.

Here's a Nimrod version:

import tables

var 
   words = ["The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"]
   hash = initTable[string, int](32)
   times = 1000000

while times > 0:
   for i in 0..len(words)-1:
      hash[words[i]] = `[]`(hash, words[i]) + 1
   times = times - 1

for kv in pairs(hash):
   echo kv.key & ", " & $kv.val

As it turns out, it performs terribly:
real 0m6.106s
user 0m6.093s
sys 0m0.003s

Basic version in D language (more than twice as slow as the C++ version, so it's probably about as fast as the C++ version without CustomHash):

void main() {
    import std.stdio, std.string;

    const words = "The quick brown fox jumped over the lazy dog
                   at a restaurant near the lake of a new era".split;

    uint[string] counts;

    foreach (immutable _; 0 .. 1_000_000)
        foreach (immutable word; words)
            counts[word]++;

    writefln("%-(%s %d\n%)", counts);
}

A bit smarter version (a little faster than the C++ version):

void main() {
    import std.stdio, std.algorithm, std.array;

    const words = [
        "The", "quick", "brown", "fox", "jumped", "over",
        "the", "lazy", "dog", "at", "a", "restaurant",
        "near", "the", "lake", "of", "a", "new", "era"]
        .map!(w => w.ptr).array;

    uint[immutable(char)*] counts;

    foreach (immutable _; 0 .. 1_000_000)
        foreach (immutable w; words)
            counts[w]++;

    foreach (w, count; counts)
        printf("%s %u\n", w, count);
}

Here is a faster (than luajit) C++ version. There was no need to use std::string, since all the strings in this micro-benchmark are constants and none of std::string's facilities are used. The same goes for std::vector; it can be replaced with std::array.

I also changed the hash function's implementation to use the trick @funny-falcon mentioned.

#include <unordered_map>
#include <array>
#include <iostream>

using namespace std;

struct CustomHash {
    size_t operator() (char const* s) const { return reinterpret_cast<size_t>(s); }
};


int main() {
    array<char const*, 19> text = { "The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era" };
    unordered_map<char const*, int, CustomHash> cnt;
    int times = 1000000;
    while (times--)
        for (auto const& word : text)
            cnt[word] += 1;


    for (auto const& keyval : cnt) {
        cout << keyval.first << " " << keyval.second << endl;
    }
    return 0;
}

@marcomagdy With g++ -O3 that runs in 0.495s for me vs 0.186s for LuaJIT with the code below (~15% faster than the original).

local text = {"The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog", "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"}
local len = #text
local map = {}
local times = 1000000

for i = 1, times do
    for j = 1, len do
        local k = text[j]
        map[k] = (map[k] or 0) + 1
    end
end

for key, value in pairs(map) do
    io.write(key, " ", value, "\n")
end

// in v8, this program will become 6 times slower. Instead we create a new object with a counter
// and this way we'll only need to access the hashtable once per loop.

^^^ And why do you force a hash lookup in the C++ loop?

I've forked this gist and updated the C++ implementation to one that's over 3 times faster on the machine I was testing on (110ms vs 410ms). See https://gist.github.com/jh3141/768a9672dba9a8fc2524

luajit is still a little faster, but it's very close now.

@nurettin - the C++ version only does one hash lookup per string per iteration -- unlike in JavaScript, accessing a missing entry in the map initialises its value to zero (that is what value-initialising an int does), so you can just increment without checking it first.
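
A minimal illustration of that point (not part of the benchmark itself, just a sketch of how operator[] behaves):

#include <cassert>
#include <string>
#include <unordered_map>

int main() {
    std::unordered_map<std::string, int> counts;
    // operator[] value-initialises a missing mapped value, so a brand-new
    // key starts out at 0 and can be incremented without an existence check.
    counts["fox"] += 1;
    counts["fox"] += 1;
    assert(counts["fox"] == 2);
    return 0;
}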

@marcomagdy - just changing to const char * rather than string is somewhat dubious, as it relies on == comparing the strings correctly, which comes down to whether or not the linker merges duplicated strings (or whether they were read from some other source rather than compiled code...). I decided to use a set to scan over the array once at the start of the process and ensure all duplicated strings were using the same pointer. Note that both v8 and luajit will do exactly the same thing, although possibly with a slightly more specialised data structure... but for 19 strings, that really doesn't matter as it takes basically no time at all.
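
To make the idea concrete, here is a rough sketch of that interning pass (my own illustration with hypothetical names, not the code from the forked gist): each distinct spelling is stored once in a std::set<std::string>, and the set element's c_str() is handed out as the canonical pointer, so duplicates become pointer-equal before the timed loop starts.

#include <array>
#include <cstdio>
#include <set>
#include <string>

// Sketch only: g_pool and intern() are illustrative names.
static std::set<std::string> g_pool;

const char* intern(const char* s) {
    // insert() is a no-op for an already-seen spelling, so every duplicate
    // gets the same canonical c_str() pointer back.
    return g_pool.insert(s).first->c_str();
}

int main() {
    std::array<const char*, 4> text = {"the", "lazy", "dog", "the"};
    for (auto& w : text) w = intern(w);  // one pass over the input, before timing
    std::printf("%s\n", text[0] == text[3] ? "same pointer" : "different pointers");
    return 0;
}

After this pass, a pointer-keyed hash map (like the one in the comment above) no longer depends on the linker merging string literals.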

And I finally got C++ down to 90ms, just edging out luajit at 95ms. Had to use a custom data structure to achieve it, but then both JITs use similar structures internally, which are optimized for holding a very small number of items with fairly fixed access patterns...

And for this implementation, clang++ is faster than g++, clocking in at 71ms (median of 5 runs).
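
For reference, a rough sketch of the general shape such a structure might take (my assumptions, not the actual code from the linked gist): keys are already-interned pointers, the capacity is a small power of two, and collisions are handled by linear probing, so a handful of distinct words stays in a couple of cache lines.

#include <cstddef>
#include <cstdio>

// Sketch only: assumes fewer distinct keys than N and pointer-identical keys
// (i.e. the strings were interned beforehand).
template <std::size_t N = 32>
struct SmallCounter {
    const char* keys[N] = {};
    int counts[N] = {};

    int& operator[](const char* key) {
        std::size_t i = (reinterpret_cast<std::size_t>(key) >> 4) & (N - 1);
        while (keys[i] && keys[i] != key)   // linear probing on pointer identity
            i = (i + 1) & (N - 1);
        keys[i] = key;                      // claims the slot on first use
        return counts[i];
    }
};

int main() {
    static const char* the = "the";
    static const char* fox = "fox";
    SmallCounter<> c;
    c[the] += 1; c[the] += 1; c[fox] += 1;
    std::printf("%s %d\n%s %d\n", the, c[the], fox, c[fox]);
    return 0;
}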

Rust version:

use std::collections::HashMap;
use std::collections::hash_map::Entry;

fn main() {
  let text = [
    "The", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog",
    "at", "a", "restaurant", "near", "the", "lake", "of", "a", "new", "era"
  ];

  let mut map: HashMap<&str, u32> = HashMap::new();

  for _ in 0u32..100000 {
    for item in text.iter() {
      match map.entry(*item) {
        Entry::Occupied(mut e) => { *e.get_mut() += 1; },
        Entry::Vacant(e)       => { e.insert(1u32); }
      };
    }
  }

  for (k, v) in map.iter() {
    println!("{} {}", k, v)
  }
}

Time for Rust version:

real    0m0.104s
user    0m0.100s
sys 0m0.000s

Time for C version:

real    0m0.179s
user    0m0.176s
sys 0m0.000s

Hello, I am not a coder and all of the above is too complicated for me. After all your benchmark study, would you have a clear comparison table that shows which one of C++, V8, luajit, or C is the most lightweight and has the smallest memory footprint?
