Skip to content

Instantly share code, notes, and snippets.

@ChayimFriedman2
Last active October 25, 2020 18:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ChayimFriedman2/781f588d1deff4ab82173e2634d95058 to your computer and use it in GitHub Desktop.
Save ChayimFriedman2/781f588d1deff4ab82173e2634d95058 to your computer and use it in GitHub Desktop.
Benchmark of various ways to lex (scan) keywords

In the comparison: simple linear lookup, trie (prefix tree) and perfect hash (generated using gperf).

I compared the keywords of the Wren programming language, using the code of the core (at https://github.com/wren-lang/wren/blob/ad4e039187bda492709786762d4a36ec0b00d91c/src/vm/wren_core.wren). First I isolated the identifiers (including keywords) to std::array<std::string> (total 855 identifiers), then benchmarked the various methods to identify keywords using Google Benchmark with https://quick-bench.com (I don't know how to share links to benchmarks in https://quick-bench.com, sorry). I did not reviewed the generated assembly code, although I hope to do that in the near future.

The results are quite surprising: of course, linear lookup is the slowest. But the others depend on the compiler: with Clang perfect hash is the fastest, while with GCC trie takes. Overall, Clang is better at linear search (the difference in GCC is so much bigger), but lose to GCC with trie and perfect hash.

GCC:

gcc

Clang:

clang

Benchmark code:

#include <array>
#include <string>
#include <cstring>

typedef enum
{
  TOKEN_LEFT_PAREN,
  TOKEN_RIGHT_PAREN,
  TOKEN_LEFT_BRACKET,
  TOKEN_RIGHT_BRACKET,
  TOKEN_LEFT_BRACE,
  TOKEN_RIGHT_BRACE,
  TOKEN_COLON,
  TOKEN_DOT,
  TOKEN_DOTDOT,
  TOKEN_DOTDOTDOT,
  TOKEN_COMMA,
  TOKEN_STAR,
  TOKEN_SLASH,
  TOKEN_PERCENT,
  TOKEN_PLUS,
  TOKEN_MINUS,
  TOKEN_LTLT,
  TOKEN_GTGT,
  TOKEN_PIPE,
  TOKEN_PIPEPIPE,
  TOKEN_CARET,
  TOKEN_AMP,
  TOKEN_AMPAMP,
  TOKEN_BANG,
  TOKEN_TILDE,
  TOKEN_QUESTION,
  TOKEN_EQ,
  TOKEN_LT,
  TOKEN_GT,
  TOKEN_LTEQ,
  TOKEN_GTEQ,
  TOKEN_EQEQ,
  TOKEN_BANGEQ,

  TOKEN_BREAK,
  TOKEN_CLASS,
  TOKEN_CONSTRUCT,
  TOKEN_ELSE,
  TOKEN_FALSE,
  TOKEN_FOR,
  TOKEN_FOREIGN,
  TOKEN_IF,
  TOKEN_IMPORT,
  TOKEN_IN,
  TOKEN_IS,
  TOKEN_NULL,
  TOKEN_RETURN,
  TOKEN_STATIC,
  TOKEN_SUPER,
  TOKEN_THIS,
  TOKEN_TRUE,
  TOKEN_VAR,
  TOKEN_WHILE,

  TOKEN_FIELD,
  TOKEN_STATIC_FIELD,
  TOKEN_NAME,
  TOKEN_NUMBER,

  // A string literal without any interpolation, or the last section of a
  // string following the last interpolated expression.
  TOKEN_STRING,

  // A portion of a string literal preceding an interpolated expression. This
  // string:
  //
  //     "a %(b) c %(d) e"
  //
  // is tokenized to:
  //
  //     TOKEN_INTERPOLATION "a "
  //     TOKEN_NAME          b
  //     TOKEN_INTERPOLATION " c "
  //     TOKEN_NAME          d
  //     TOKEN_STRING        " e"
  TOKEN_INTERPOLATION,

  TOKEN_LINE,

  TOKEN_ERROR,
  TOKEN_EOF
} TokenType;

typedef struct
{
  const char *identifier;
  size_t length;
  TokenType tokenType;
} Keyword;

static Keyword keywords[] =
    {
        {"break", 5, TOKEN_BREAK},
        {"class", 5, TOKEN_CLASS},
        {"construct", 9, TOKEN_CONSTRUCT},
        {"else", 4, TOKEN_ELSE},
        {"false", 5, TOKEN_FALSE},
        {"for", 3, TOKEN_FOR},
        {"foreign", 7, TOKEN_FOREIGN},
        {"if", 2, TOKEN_IF},
        {"import", 6, TOKEN_IMPORT},
        {"in", 2, TOKEN_IN},
        {"is", 2, TOKEN_IS},
        {"null", 4, TOKEN_NULL},
        {"return", 6, TOKEN_RETURN},
        {"static", 6, TOKEN_STATIC},
        {"super", 5, TOKEN_SUPER},
        {"this", 4, TOKEN_THIS},
        {"true", 4, TOKEN_TRUE},
        {"var", 3, TOKEN_VAR},
        {"while", 5, TOKEN_WHILE},
        {NULL, 0, TOKEN_EOF} // Sentinel to mark the end of the array.
};

TokenType SimpleLookup(const char *identifier, size_t length)
{
  for (int i = 0; keywords[i].identifier != NULL; i++)
  {
    if (length == keywords[i].length &&
        memcmp(identifier, keywords[i].identifier, length) == 0)
    {
      return keywords[i].tokenType;
    }
  }
  return TOKEN_NAME;
}

TokenType Trie(const char *identifier, size_t length)
{
  if (length >= 2)
  {
    if (identifier[0] == 'b')
    {
      return length == 5 && memcmp(identifier + 1, "reak", 4) == 0 ? TOKEN_BREAK : TOKEN_NAME;
    }
    else if (identifier[0] == 'c')
    {
      if (identifier[0] == 'l')
      {
        return length == 5 && memcmp(identifier + 2, "ass", 3) == 0 ? TOKEN_CLASS : TOKEN_NAME;
      }
      return length == 9 && memcmp(identifier + 1, "onstruct", 8) == 0 ? TOKEN_CONSTRUCT : TOKEN_NAME;
    }
    else if (identifier[0] == 'e')
    {
      return length == 4 && memcmp(identifier + 1, "lse", 3) == 0 ? TOKEN_ELSE : TOKEN_NAME;
    }
    else if (identifier[0] == 'f')
    {
      if (identifier[0] == 'a')
      {
        return length == 5 && memcmp(identifier + 2, "lse", 3) == 0 ? TOKEN_FALSE : TOKEN_NAME;
      }
      else if (identifier[0] == 'o')
      {
        if (identifier[0] == 'r')
        {
          if (length == 3)
          {
            return TOKEN_FOR;
          }
          return length == 7 && memcmp(identifier + 3, "eign", 4) == 0 ? TOKEN_FOREIGN : TOKEN_NAME;
        }
      }
    }
    else if (identifier[0] == 'i')
    {
      if (length == 2)
      {
        return identifier[1] == 'f' ? TOKEN_IF : identifier[1] == 'n' ? TOKEN_IN : identifier[1] == 's' ? TOKEN_IS : TOKEN_NAME;
      }
      return length == 6 && memcmp(identifier + 1, "mport", 5) == 0 ? TOKEN_IMPORT : TOKEN_NAME;
    }
    else if (identifier[0] == 'n')
    {
      return length == 4 && memcmp(identifier + 1, "ull", 3) == 0 ? TOKEN_NULL : TOKEN_NAME;
    }
    else if (identifier[0] == 'r')
    {
      return length == 6 && memcmp(identifier + 1, "eturn", 5) == 0 ? TOKEN_RETURN : TOKEN_NAME;
    }
    else if (identifier[0] == 's')
    {
      if (identifier[0] == 't')
      {
        return length == 6 && memcmp(identifier + 2, "atic", 4) == 0 ? TOKEN_STATIC : TOKEN_NAME;
      }
      return length == 5 && memcmp(identifier + 1, "uper", 4) == 0 ? TOKEN_SUPER : TOKEN_NAME;
    }
    else if (identifier[0] == 't')
    {
      if (length == 4)
      {
        if (identifier[0] == 'h')
        {
          return memcmp(identifier + 2, "is", 2) == 0 ? TOKEN_THIS : TOKEN_NAME;
        }
        return memcmp(identifier + 1, "rue", 3) == 0 ? TOKEN_TRUE : TOKEN_NAME;
      }
    }
    else if (identifier[0] == 'v')
    {
      return length == 3 && memcmp(identifier + 1, "ar", 2) == 0 ? TOKEN_VAR : TOKEN_NAME;
    }
    else if (identifier[0] == 'w')
    {
      return length == 5 && memcmp(identifier + 1, "hile", 4) == 0 ? TOKEN_WHILE : TOKEN_NAME;
    }
  }
  return TOKEN_NAME;
}

const std::array<std::string, 855> x {
    "class", "Bool", "class", "Fiber", "class", "Fn", "class", "Null", "class", "Num", "class",
    "Sequence", "all", "f", "var", "result", "true", "for", "element", "in", "this", "result",
    "f", "call", "element", "if", "result", "return", "result", "return", "result", "any", "f",
    "var", "result", "false", "for", "element", "in", "this", "result", "f", "call", "element",
    "if", "result", "return", "result", "return", "result", "contains", "element", "for", "item",
    "in", "this", "if", "element", "item", "return", "true", "return", "false", "count", "var",
    "result", "for", "element", "in", "this", "result", "result", "return", "result", "count",
    "f", "var", "result", "for", "element", "in", "this", "if", "f", "call", "element", "result",
    "result", "return", "result", "each", "f", "for", "element", "in", "this", "f", "call",
    "element", "isEmpty", "iterate", "null", "false", "true", "map", "transformation", "MapSequence",
    "new", "this", "transformation", "skip", "count", "if", "count", "is", "Num", "count",
    "isInteger", "count", "Fiber", "abort", "return", "SkipSequence", "new", "this", "count",
    "take", "count", "if", "count", "is", "Num", "count", "isInteger", "count", "Fiber", "abort",
    "return", "TakeSequence", "new", "this", "count", "where", "predicate", "WhereSequence", "new",
    "this", "predicate", "reduce", "acc", "f", "for", "element", "in", "this", "acc", "f", "call",
    "acc", "element", "return", "acc", "reduce", "f", "var", "iter", "iterate", "null", "if",
    "iter", "Fiber", "abort", "var", "result", "iteratorValue", "iter", "while", "iter", "iterate",
    "iter", "result", "f", "call", "result", "iteratorValue", "iter", "return", "result", "join",
    "join", "join", "sep", "var", "first", "true", "var", "result", "for", "element", "in", "this",
    "if", "first", "result", "result", "sep", "first", "false", "result", "result", "element",
    "toString", "return", "result", "toList", "var", "result", "List", "new", "for", "element",
    "in", "this", "result", "add", "element", "return", "result", "class", "MapSequence", "is",
    "Sequence", "construct", "new", "sequence", "fn", "_sequence", "sequence", "_fn", "fn",
    "iterate", "iterator", "_sequence", "iterate", "iterator", "iteratorValue", "iterator", "_fn",
    "call", "_sequence", "iteratorValue", "iterator", "class", "SkipSequence", "is", "Sequence",
    "construct", "new", "sequence", "count", "_sequence", "sequence", "_count", "count",
    "iterate", "iterator", "if", "iterator", "return", "_sequence", "iterate", "iterator",
    "else", "iterator", "_sequence", "iterate", "iterator", "var", "count", "_count", "while",
    "count", "iterator", "iterator", "_sequence", "iterate", "iterator", "count", "count", "return",
    "iterator", "iteratorValue", "iterator", "_sequence", "iteratorValue", "iterator", "class",
    "TakeSequence", "is", "Sequence", "construct", "new", "sequence", "count", "_sequence",
    "sequence", "_count", "count", "iterate", "iterator", "if", "iterator", "_taken", "else",
    "_taken", "_taken", "return", "_taken", "_count", "null", "_sequence", "iterate", "iterator",
    "iteratorValue", "iterator", "_sequence", "iteratorValue", "iterator", "class", "WhereSequence",
    "is", "Sequence", "construct", "new", "sequence", "fn", "_sequence", "sequence", "_fn", "fn",
    "iterate", "iterator", "while", "iterator", "_sequence", "iterate", "iterator", "if", "_fn",
    "call", "_sequence", "iteratorValue", "iterator", "break", "return", "iterator", "iteratorValue",
    "iterator", "_sequence", "iteratorValue", "iterator", "class", "String", "is", "Sequence",
    "bytes", "StringByteSequence", "new", "this", "codePoints", "StringCodePointSequence", "new",
    "this", "split", "delimiter", "if", "delimiter", "is", "String", "delimiter", "isEmpty",
    "Fiber", "abort", "var", "result", "var", "last", "var", "index", "var", "delimSize",
    "delimiter", "byteCount_", "var", "size", "byteCount_", "while", "last", "size", "index",
    "indexOf", "delimiter", "last", "result", "add", "this", "last", "index", "last", "index",
    "delimSize", "if", "last", "size", "result", "add", "this", "last", "else", "result", "add",
    "return", "result", "replace", "from", "to", "if", "from", "is", "String", "from", "isEmpty",
    "Fiber", "abort", "else", "if", "to", "is", "String", "Fiber", "abort", "var", "result", "var",
    "last", "var", "index", "var", "fromSize", "from", "byteCount_", "var", "size", "byteCount_",
    "while", "last", "size", "index", "indexOf", "from", "last", "result", "result", "this", "last",
    "index", "to", "last", "index", "fromSize", "if", "last", "size", "result", "result", "this",
    "last", "return", "result", "trim", "trim_", "true", "true", "trim", "chars", "trim_", "chars",
    "true", "true", "trimEnd", "trim_", "false", "true", "trimEnd", "chars", "trim_", "chars",
    "false", "true", "trimStart", "trim_", "true", "false", "trimStart", "chars", "trim_", "chars",
    "true", "false", "trim_", "chars", "trimStart", "trimEnd", "if", "chars", "is", "String",
    "Fiber", "abort", "var", "codePoints", "chars", "codePoints", "toList", "var", "start", "if",
    "trimStart", "while", "start", "iterate", "start", "if", "codePoints", "contains",
    "codePointAt_", "start", "break", "if", "start", "false", "return", "else", "start", "var",
    "end", "if", "trimEnd", "end", "byteCount_", "while", "end", "start", "var", "codePoint",
    "codePointAt_", "end", "if", "codePoint", "codePoints", "contains", "codePoint", "break",
    "end", "end", "if", "end", "start", "return", "else", "end", "return", "this", "start", "end",
    "count", "if", "count", "is", "Num", "count", "isInteger", "count", "Fiber", "abort", "var",
    "result", "for", "i", "in", "count", "result", "result", "this", "return", "result", "class",
    "StringByteSequence", "is", "Sequence", "construct", "new", "string", "_string", "string",
    "index", "_string", "byteAt_", "index", "iterate", "iterator", "_string", "iterateByte_",
    "iterator", "iteratorValue", "iterator", "_string", "byteAt_", "iterator", "count", "_string",
    "byteCount_", "class", "StringCodePointSequence", "is", "Sequence", "construct", "new",
    "string", "_string", "string", "index", "_string", "codePointAt_", "index", "iterate",
    "iterator", "_string", "iterate", "iterator", "iteratorValue", "iterator", "_string",
    "codePointAt_", "iterator", "count", "_string", "count", "class", "List", "is", "Sequence",
    "addAll", "other", "for", "element", "in", "other", "add", "element", "return", "other",
    "toString", "join", "other", "var", "result", "this", "for", "element", "in", "other", "result",
    "add", "element", "return", "result", "count", "if", "count", "is", "Num", "count",
    "isInteger", "count", "Fiber", "abort", "var", "result", "for", "i", "in", "count", "result",
    "addAll", "this", "return", "result", "class", "Map", "is", "Sequence", "keys",
    "MapKeySequence", "new", "this", "values", "MapValueSequence", "new", "this", "toString", "var",
    "first", "true", "var", "result", "for", "key", "in", "keys", "if", "first", "result", "result",
    "first", "false", "result", "result", "key", "this", "key", "return", "result", "iteratorValue",
    "iterator", "return", "MapEntry", "new", "keyIteratorValue_", "iterator", "valueIteratorValue_",
    "iterator", "class", "MapEntry", "construct", "new", "key", "value", "_key", "key", "_value",
    "value", "key", "_key", "value", "_value", "toString", "_key", "_value", "class",
    "MapKeySequence", "is", "Sequence", "construct", "new", "map", "_map", "map", "iterate", "n",
    "_map", "iterate", "n", "iteratorValue", "iterator", "_map", "keyIteratorValue_", "iterator",
    "class", "MapValueSequence", "is", "Sequence", "construct", "new", "map", "_map", "map",
    "iterate", "n", "_map", "iterate", "n", "iteratorValue", "iterator", "_map",
    "valueIteratorValue_", "iterator", "class", "Range", "is", "Sequence", "class", "System",
    "static", "print", "writeString_", "static", "print", "obj", "writeObject_", "obj",
    "writeString_", "return", "obj", "static", "printAll", "sequence", "for", "object", "in",
    "sequence", "writeObject_", "object", "writeString_", "static", "write", "obj", "writeObject_",
    "obj", "return", "obj", "static", "writeAll", "sequence", "for", "object", "in", "sequence",
    "writeObject_", "object", "static", "writeObject_", "obj", "var", "string", "obj", "toString",
    "if", "string", "is", "String", "writeString_", "string", "else", "writeString_",
};

static void SimpleLookupBench(benchmark::State &state)
{
  for (auto _ : state)
  {
    for (auto i : x)
    {
      TokenType type = SimpleLookup(i.c_str(), i.size());
      benchmark::DoNotOptimize(type);
    }
  }
}
BENCHMARK(SimpleLookupBench);

static void TrieBench(benchmark::State &state)
{
  for (auto _ : state)
  {
    for (auto i : x)
    {
      TokenType type = Trie(i.c_str(), i.size());
      benchmark::DoNotOptimize(type);
    }
  }
}
BENCHMARK(TrieBench);

/* C++ code produced by gperf version 3.1 */
/* Command-line: gperf gperf-wren.txt  */
/* Computed positions: -k'2' */

#if !((' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126))
/* The character set is not based on ISO-646.  */
#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gperf@gnu.org>."
#endif

struct PerfectKeywordHashTableEntry
{
  const char *name;
  TokenType value;
};
enum
{
  TOTAL_KEYWORDS = 19,
  MIN_WORD_LENGTH = 2,
  MAX_WORD_LENGTH = 9,
  MIN_HASH_VALUE = 2,
  MAX_HASH_VALUE = 25
};

/* maximum key range = 24, duplicates = 0 */

class PerfectKeywordHash
{
private:
  static inline unsigned int CalculatePerfectHash(const char *str, size_t len);

public:
  static TokenType PerfectHash(const char *str, size_t len);
};

inline unsigned int
PerfectKeywordHash::CalculatePerfectHash(const char *str, size_t len)
{
  static const unsigned char asso_values[] = {
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 5, 26, 26,
    26, 10, 15, 26, 20, 26, 26, 26, 15, 5,
    10, 0, 26, 26, 10, 0, 0, 0, 26, 26,
    26, 26, 26, 26, 26, 26, 26, 26
  };
  return len + asso_values[static_cast<unsigned char>(str[1])];
}

static const unsigned char PerfectKeywordLengthTable[] = {
  0, 0, 2, 3, 4, 5, 6, 7, 3, 9, 5, 6, 2, 0, 4, 5, 6, 2, 0, 4, 5, 0, 0, 0, 4, 5
};

static const struct PerfectKeywordHashTableEntry PerfectKeywordHashTable[] = {
  {"", TOKEN_NAME}, {"", TOKEN_NAME},
  {"is", TOKEN_IS},
  {"for", TOKEN_FOR},
  {"null", TOKEN_NULL},
  {"super", TOKEN_SUPER},
  {"static", TOKEN_STATIC},
  {"foreign", TOKEN_FOREIGN},
  {"var", TOKEN_VAR},
  {"construct", TOKEN_CONSTRUCT},
  {"false", TOKEN_FALSE},
  {"import", TOKEN_IMPORT},
  {"in", TOKEN_IN},
  {"", TOKEN_NAME},
  {"true", TOKEN_TRUE},
  {"break", TOKEN_BREAK},
  {"return", TOKEN_RETURN},
  {"if", TOKEN_IF},
  {"", TOKEN_NAME},
  {"else", TOKEN_ELSE},
  {"class", TOKEN_CLASS},
  {"", TOKEN_NAME},
  {"", TOKEN_NAME},
  {"", TOKEN_NAME},
  {"this", TOKEN_THIS},
  {"while", TOKEN_WHILE}
};

TokenType PerfectKeywordHash::PerfectHash(const char *str, size_t len)
{
  if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
  {
    unsigned int key = CalculatePerfectHash(str, len);

    if (key <= MAX_HASH_VALUE)
      if (len == PerfectKeywordLengthTable[key])
      {
        const char *s = PerfectKeywordHashTable[key].name;

        if (*str == *s && !memcmp(str + 1, s + 1, len - 1))
          return PerfectKeywordHashTable[key].value;
      }
  }
  return TOKEN_NAME;
}

static void PerfectHashBench(benchmark::State &state)
{
  for (auto _ : state)
  {
    for (auto i : x)
    {
      TokenType type = PerfectKeywordHash::PerfectHash(i.c_str(), i.size());
      benchmark::DoNotOptimize(type);
    }
  }
}
BENCHMARK(PerfectHashBench);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment