-
-
Save sheredom/cc44e5f1f9c1c4d807d49819b01685f7 to your computer and use it in GitHub Desktop.
Branchless [[clang::musttail]] tokenizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Creative Commons Legal Code | |
| // | |
| // CC0 1.0 Universal | |
| // | |
| // CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE | |
| // LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN | |
| // ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS | |
| // INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES | |
| // REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS | |
| // PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM | |
| // THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED | |
| // HEREUNDER. | |
| // | |
| // Statement of Purpose | |
| // | |
| // The laws of most jurisdictions throughout the world automatically confer | |
| // exclusive Copyright and Related Rights (defined below) upon the creator | |
| // and subsequent owner(s) (each and all, an "owner") of an original work of | |
| // authorship and/or a database (each, a "Work"). | |
| // | |
| // Certain owners wish to permanently relinquish those rights to a Work for | |
| // the purpose of contributing to a commons of creative, cultural and | |
| // scientific works ("Commons") that the public can reliably and without fear | |
| // of later claims of infringement build upon, modify, incorporate in other | |
| // works, reuse and redistribute as freely as possible in any form whatsoever | |
| // and for any purposes, including without limitation commercial purposes. | |
| // These owners may contribute to the Commons to promote the ideal of a free | |
| // culture and the further production of creative, cultural and scientific | |
| // works, or to gain reputation or greater distribution for their Work in | |
| // part through the use and efforts of others. | |
| // | |
| // For these and/or other purposes and motivations, and without any | |
| // expectation of additional consideration or compensation, the person | |
| // associating CC0 with a Work (the "Affirmer"), to the extent that he or she | |
| // is an owner of Copyright and Related Rights in the Work, voluntarily | |
| // elects to apply CC0 to the Work and publicly distribute the Work under its | |
| // terms, with knowledge of his or her Copyright and Related Rights in the | |
| // Work and the meaning and intended legal effect of CC0 on those rights. | |
| // | |
| // 1. Copyright and Related Rights. A Work made available under CC0 may be | |
| // protected by copyright and related or neighboring rights ("Copyright and | |
| // Related Rights"). Copyright and Related Rights include, but are not | |
| // limited to, the following: | |
| // | |
| // i. the right to reproduce, adapt, distribute, perform, display, | |
| // communicate, and translate a Work; | |
| // ii. moral rights retained by the original author(s) and/or performer(s); | |
| // iii. publicity and privacy rights pertaining to a person's image or | |
| // likeness depicted in a Work; | |
| // iv. rights protecting against unfair competition in regards to a Work, | |
| // subject to the limitations in paragraph 4(a), below; | |
| // v. rights protecting the extraction, dissemination, use and reuse of data | |
| // in a Work; | |
| // vi. database rights (such as those arising under Directive 96/9/EC of the | |
| // European Parliament and of the Council of 11 March 1996 on the legal | |
| // protection of databases, and under any national implementation | |
| // thereof, including any amended or successor version of such | |
| // directive); and | |
| // vii. other similar, equivalent or corresponding rights throughout the | |
| // world based on applicable law or treaty, and any national | |
| // implementations thereof. | |
| // | |
| // 2. Waiver. To the greatest extent permitted by, but not in contravention | |
| // of, applicable law, Affirmer hereby overtly, fully, permanently, | |
| // irrevocably and unconditionally waives, abandons, and surrenders all of | |
| // Affirmer's Copyright and Related Rights and associated claims and causes | |
| // of action, whether now known or unknown (including existing as well as | |
| // future claims and causes of action), in the Work (i) in all territories | |
| // worldwide, (ii) for the maximum duration provided by applicable law or | |
| // treaty (including future time extensions), (iii) in any current or future | |
| // medium and for any number of copies, and (iv) for any purpose whatsoever, | |
| // including without limitation commercial, advertising or promotional | |
| // purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each | |
| // member of the public at large and to the detriment of Affirmer's heirs and | |
| // successors, fully intending that such Waiver shall not be subject to | |
| // revocation, rescission, cancellation, termination, or any other legal or | |
| // equitable action to disrupt the quiet enjoyment of the Work by the public | |
| // as contemplated by Affirmer's express Statement of Purpose. | |
| // | |
| // 3. Public License Fallback. Should any part of the Waiver for any reason | |
| // be judged legally invalid or ineffective under applicable law, then the | |
| // Waiver shall be preserved to the maximum extent permitted taking into | |
| // account Affirmer's express Statement of Purpose. In addition, to the | |
| // extent the Waiver is so judged Affirmer hereby grants to each affected | |
| // person a royalty-free, non transferable, non sublicensable, non exclusive, | |
| // irrevocable and unconditional license to exercise Affirmer's Copyright and | |
| // Related Rights in the Work (i) in all territories worldwide, (ii) for the | |
| // maximum duration provided by applicable law or treaty (including future | |
| // time extensions), (iii) in any current or future medium and for any number | |
| // of copies, and (iv) for any purpose whatsoever, including without | |
| // limitation commercial, advertising or promotional purposes (the | |
| // "License"). The License shall be deemed effective as of the date CC0 was | |
| // applied by Affirmer to the Work. Should any part of the License for any | |
| // reason be judged legally invalid or ineffective under applicable law, such | |
| // partial invalidity or ineffectiveness shall not invalidate the remainder | |
| // of the License, and in such case Affirmer hereby affirms that he or she | |
| // will not (i) exercise any of his or her remaining Copyright and Related | |
| // Rights in the Work or (ii) assert any associated claims and causes of | |
| // action with respect to the Work, in either case contrary to Affirmer's | |
| // express Statement of Purpose. | |
| // | |
| // 4. Limitations and Disclaimers. | |
| // | |
| // a. No trademark or patent rights held by Affirmer are waived, abandoned, | |
| // surrendered, licensed or otherwise affected by this document. | |
| // b. Affirmer offers the Work as-is and makes no representations or | |
| // warranties of any kind concerning the Work, express, implied, | |
| // statutory or otherwise, including without limitation warranties of | |
| // title, merchantability, fitness for a particular purpose, non | |
| // infringement, or the absence of latent or other defects, accuracy, or | |
| // the present or absence of errors, whether or not discoverable, all to | |
| // the greatest extent permissible under applicable law. | |
| // c. Affirmer disclaims responsibility for clearing rights of other persons | |
| // that may apply to the Work or any use thereof, including without | |
| // limitation any person's Copyright and Related Rights in the Work. | |
| // Further, Affirmer disclaims responsibility for obtaining any necessary | |
| // consents, permissions or other rights required for any use of the | |
| // Work. | |
| // d. Affirmer understands and acknowledges that Creative Commons is not a | |
| // party to this document and has no duty or obligation with respect to | |
| // this CC0 or use of the Work. | |
| // tokenizer.h | |
| #pragma once | |
| #include <stdint.h> | |
| #ifdef __cplusplus | |
| extern "C" { | |
| #endif | |
// Opaque handle to a tokenizer instance; the concrete type lives in the .cpp.
typedef void *tokenizer_t;

// Every kind of token the tokenizer can produce. Order matters: the
// TEMPLATE_TOKEN machinery relies on the TYPE/TYPE_eq name pairing.
typedef enum token_type_e {
  token_type_end,       // end of input
  token_type_comment,   // '#' to end of line
  token_type_and,       // '&'
  token_type_or,        // '|'
  token_type_xor,
  token_type_not,
  token_type_add,
  token_type_sub,
  token_type_mul,
  token_type_div,
  token_type_mod,
  token_type_lcurly,
  token_type_rcurly,
  token_type_lparen,
  token_type_rparen,
  token_type_larray,
  token_type_rarray,
  token_type_comma,
  token_type_dot,
  token_type_codepoint, // 'x' character literal
  token_type_string,    // "..." string literal
  token_type_colon,
  token_type_semicolon,
  token_type_at,
  token_type_eq,        // '='
  token_type_ceq,       // '=='
  token_type_neq,       // '<>'
  token_type_clt,       // '<'
  token_type_cgt,       // '>'
  token_type_cle,       // '<='
  token_type_cge,       // '>='
  token_type_lsh,       // '<<'
  token_type_rsh,       // '>>'
  token_type_and_eq,    // compound assignments follow their base operators
  token_type_or_eq,
  token_type_xor_eq,
  token_type_not_eq,
  token_type_add_eq,
  token_type_sub_eq,
  token_type_mul_eq,
  token_type_div_eq,
  token_type_mod_eq,
  token_type_lsh_eq,
  token_type_rsh_eq,
  token_type_if,        // keyword "if"
  token_type_for,       // keyword "for"
  token_type_loop,      // keyword "loop"
  token_type_int,       // integer literal (decimal, 0b..., 0x...)
  token_type_rat,       // rational literal (has '.' or an exponent)
  token_type_ident,     // identifier
  token_type_nil,       // keyword "nil"
  token_type_unhandled, // byte the tokenizer could not handle
} token_type_t;

// A single token: its kind plus where it lies in the source buffer.
typedef struct token_s {
  token_type_t type; // kind of token
  uint32_t offset;   // byte offset of the token's first character
  uint32_t length;   // length of the token in bytes
  uint32_t line;     // line of the token's first character
  uint32_t column;   // column of the token's first character
} token_t;
| int tokenizer_create(const char *data, tokenizer_t *out_tokenizer); | |
| int tokenizer_next(tokenizer_t tokenizer, token_t *out_token); | |
| int tokenizer_destroy(tokenizer_t tokenizer); | |
| #ifdef __cplusplus | |
| } // extern "C" | |
| #endif | |
| // tokenizer.cpp | |
| #include "tokenizer.h" | |
| #include <stdlib.h> | |
| #include <string.h> | |
| #include <atomic> | |
// Internal tokenizer state: a cursor into the NUL-terminated input buffer
// plus the line/column bookkeeping used to stamp tokens.
struct tokenizer_s {
  const uint8_t *data; // NUL-terminated input (not owned)
  uint32_t current;    // byte index of the next character to consume
  uint32_t line;       // current line
  uint32_t column;     // current column
  // Consume one non-newline character (newlines are handled separately so
  // that line/column stay correct).
  void skip_current() {
    current += 1;
    column += 1;
  }
};
| static int unhandled(tokenizer_s *const, token_t *out_token) { | |
| out_token->type = token_type_unhandled; | |
| return -1; | |
| } | |
// True when `c` lies in the inclusive range [L, H]; the range itself is
// validated at compile time.
template <char L, char H> static constexpr bool in_range(const char c) {
  static_assert(L <= H);
  return !(c < L) && !(H < c);
}
| static constexpr bool is_ident_leading_char(const char c) { | |
| return ('_' == c) || in_range<'a', 'z'>(c) || in_range<'A', 'Z'>(c); | |
| } | |
| static constexpr bool is_ident_char(const char c) { | |
| return is_ident_leading_char(c) || in_range<'0', '9'>(c); | |
| } | |
| template <typename T> struct tokenizer_jump_table_s { | |
| using Super = tokenizer_jump_table_s<T>; | |
| using JumpTableFuncType = int (*)(tokenizer_s *const, token_t *); | |
| static const T &singleton() { | |
| static const T t; | |
| return t; | |
| } | |
| explicit constexpr tokenizer_jump_table_s(JumpTableFuncType splat_func) { | |
| for (unsigned i = 0; i < 256; i++) { | |
| jt[i] = splat_func; | |
| } | |
| } | |
| JumpTableFuncType jt[256]; | |
| }; | |
| static int return_zero(tokenizer_s *const tokenizer, token_t *out_token) { | |
| return 0; | |
| } | |
| template <token_type_t TYPE> | |
| struct maybe_assignment_jump_table_s final | |
| : public tokenizer_jump_table_s<maybe_assignment_jump_table_s<TYPE>> { | |
| static int equals(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = TYPE; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| return 0; | |
| } | |
| constexpr maybe_assignment_jump_table_s() | |
| : tokenizer_jump_table_s<maybe_assignment_jump_table_s<TYPE>>( | |
| &return_zero) { | |
| this->jt['='] = = | |
| } | |
| }; | |
| struct rsh_jump_table_s final | |
| : public tokenizer_jump_table_s<rsh_jump_table_s> { | |
| static int equals(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_rsh_eq; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| return 0; | |
| } | |
| constexpr rsh_jump_table_s() : Super(&return_zero) { jt['='] = = } | |
| }; | |
| struct gt_jump_table_s final : public tokenizer_jump_table_s<gt_jump_table_s> { | |
| static int equals(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_cge; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| return 0; | |
| } | |
| static int rsh(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_rsh; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return rsh_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr gt_jump_table_s() : Super(&return_zero) { | |
| jt['='] = = | |
| jt['>'] = ↱ | |
| } | |
| }; | |
| struct lsh_jump_table_s final | |
| : public tokenizer_jump_table_s<lsh_jump_table_s> { | |
| static int equals(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_lsh_eq; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| return 0; | |
| } | |
| constexpr lsh_jump_table_s() : Super(&return_zero) { jt['='] = = } | |
| }; | |
| struct lt_jump_table_s final : public tokenizer_jump_table_s<lt_jump_table_s> { | |
| static int equals(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_cle; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| return 0; | |
| } | |
| static int not_equals(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_neq; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| return 0; | |
| } | |
| static int lsh(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_lsh; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return lsh_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr lt_jump_table_s() : Super(&return_zero) { | |
| jt['='] = = | |
| jt['>'] = ¬_equals; | |
| jt['<'] = ↰ | |
| } | |
| }; | |
| struct eq_jump_table_s final : public tokenizer_jump_table_s<eq_jump_table_s> { | |
| static int equals(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_ceq; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| return 0; | |
| } | |
| constexpr eq_jump_table_s() : Super(&return_zero) { jt['='] = = } | |
| }; | |
| struct comment_jump_table_s final | |
| : public tokenizer_jump_table_s<comment_jump_table_s> { | |
| static int newline(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->current += 1; | |
| tokenizer->line += 1; | |
| tokenizer->column = 0; | |
| return 0; | |
| } | |
| static int keep_going(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return comment_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr comment_jump_table_s() : Super(&keep_going) { | |
| jt['\n'] = &newline; | |
| jt['\0'] = &return_zero; | |
| } | |
| }; | |
| struct string_jump_table_s final | |
| : public tokenizer_jump_table_s<string_jump_table_s> { | |
| static int double_quote(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| return 0; | |
| } | |
| static int keep_going(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return string_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr string_jump_table_s() : Super(&keep_going) { | |
| jt['\t'] = &unhandled; | |
| jt['\r'] = &unhandled; | |
| jt['\n'] = &unhandled; | |
| jt['\0'] = &unhandled; | |
| jt['"'] = &double_quote; | |
| } | |
| }; | |
| struct codepoint_jump_table_s final | |
| : public tokenizer_jump_table_s<codepoint_jump_table_s> { | |
| static int quote(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| return 0; | |
| } | |
| static int keep_going(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return codepoint_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr codepoint_jump_table_s() : Super(&keep_going) { | |
| jt['\t'] = &unhandled; | |
| jt['\r'] = &unhandled; | |
| jt['\n'] = &unhandled; | |
| jt['\0'] = &unhandled; | |
| jt['\''] = "e; | |
| } | |
| }; | |
| struct ident_jump_table_s final | |
| : public tokenizer_jump_table_s<ident_jump_table_s> { | |
| static int ident(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return ident_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr ident_jump_table_s() : Super(&return_zero) { | |
| for (unsigned i = 0; i < 256; i++) { | |
| if (is_ident_char(static_cast<char>(i))) { | |
| jt[i] = &ident; | |
| } | |
| } | |
| } | |
| }; | |
| static int convert_to_ident(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_ident; | |
| [[clang::musttail]] return ident_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| template <char C, token_type_t TY, typename T> | |
| struct maybe_string_jump_table_s final | |
| : public tokenizer_jump_table_s<maybe_string_jump_table_s<C, TY, T>> { | |
| static int special(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return tokenizer_jump_table_s<T>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr maybe_string_jump_table_s() | |
| : tokenizer_jump_table_s<maybe_string_jump_table_s<C, TY, T>>( | |
| &convert_to_ident) { | |
| this->jt[C] = &special; | |
| } | |
| }; | |
| struct maybe_string_end_jump_table_s final | |
| : public tokenizer_jump_table_s<maybe_string_end_jump_table_s> { | |
| constexpr maybe_string_end_jump_table_s() : Super(&return_zero) { | |
| for (unsigned i = 0; i < 256; i++) { | |
| if (is_ident_char(static_cast<char>(i))) { | |
| jt[i] = &convert_to_ident; | |
| } | |
| } | |
| } | |
| }; | |
| using nil_l_jump_table_s = | |
| maybe_string_jump_table_s<'l', token_type_nil, | |
| maybe_string_end_jump_table_s>; | |
| using nil_i_jump_table_s = | |
| maybe_string_jump_table_s<'i', token_type_ident, nil_l_jump_table_s>; | |
| using if_f_jump_table_s = | |
| maybe_string_jump_table_s<'f', token_type_if, | |
| maybe_string_end_jump_table_s>; | |
| using for_r_jump_table_s = | |
| maybe_string_jump_table_s<'r', token_type_for, | |
| maybe_string_end_jump_table_s>; | |
| using for_o_jump_table_s = | |
| maybe_string_jump_table_s<'o', token_type_ident, for_r_jump_table_s>; | |
| using loop_p_jump_table_s = | |
| maybe_string_jump_table_s<'p', token_type_loop, | |
| maybe_string_end_jump_table_s>; | |
| using loop_o2_jump_table_s = | |
| maybe_string_jump_table_s<'o', token_type_ident, loop_p_jump_table_s>; | |
| using loop_o1_jump_table_s = | |
| maybe_string_jump_table_s<'o', token_type_ident, loop_o2_jump_table_s>; | |
| template <bool FIRST = true> | |
| struct binary_jump_table_s final | |
| : public tokenizer_jump_table_s<binary_jump_table_s<FIRST>> { | |
| static int binary(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return binary_jump_table_s<false>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr binary_jump_table_s() | |
| : tokenizer_jump_table_s<binary_jump_table_s<FIRST>>( | |
| FIRST ? &unhandled : &return_zero) { | |
| this->jt['0'] = &binary; | |
| this->jt['1'] = &binary; | |
| } | |
| }; | |
| template <bool FIRST = true> | |
| struct hexadecimal_jump_table_s final | |
| : public tokenizer_jump_table_s<hexadecimal_jump_table_s<FIRST>> { | |
| static int hexadecimal(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return hexadecimal_jump_table_s<false>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr hexadecimal_jump_table_s() | |
| : tokenizer_jump_table_s<hexadecimal_jump_table_s<FIRST>>( | |
| FIRST ? &unhandled : &return_zero) { | |
| for (unsigned i = 0; i < 256; i++) { | |
| if (in_range<'0', '9'>(static_cast<char>(i)) || | |
| in_range<'a', 'f'>(static_cast<char>(i)) || | |
| in_range<'A', 'F'>(static_cast<char>(i))) { | |
| this->jt[i] = &hexadecimal; | |
| } | |
| } | |
| } | |
| }; | |
| template <bool FIRST = true, bool NEGATIVE = false> | |
| struct exponent_jump_table_s final | |
| : public tokenizer_jump_table_s<exponent_jump_table_s<FIRST, NEGATIVE>> { | |
| static int digit(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return exponent_jump_table_s<false, true>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int negative(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return exponent_jump_table_s<true, true>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr exponent_jump_table_s() | |
| : tokenizer_jump_table_s<exponent_jump_table_s<FIRST, NEGATIVE>>( | |
| FIRST ? &unhandled : &return_zero) { | |
| for (unsigned i = 0; i < 256; i++) { | |
| if (in_range<'0', '9'>(static_cast<char>(i))) { | |
| this->jt[i] = &digit; | |
| } | |
| } | |
| if constexpr (!NEGATIVE) { | |
| this->jt['-'] = &negative; | |
| } | |
| } | |
| }; | |
| template <bool FIRST = true> | |
| struct rational_jump_table_s final | |
| : public tokenizer_jump_table_s<rational_jump_table_s<FIRST>> { | |
| static int digit(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return rational_jump_table_s<false>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int exponent(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return exponent_jump_table_s<>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr rational_jump_table_s() | |
| : tokenizer_jump_table_s<rational_jump_table_s<FIRST>>( | |
| FIRST ? &unhandled : &return_zero) { | |
| for (unsigned i = 0; i < 256; i++) { | |
| if (in_range<'0', '9'>(static_cast<char>(i))) { | |
| this->jt[i] = &digit; | |
| } | |
| } | |
| if constexpr (!FIRST) { | |
| this->jt['e'] = &exponent; | |
| this->jt['E'] = &exponent; | |
| } | |
| } | |
| }; | |
| // Zero can only be alone, or before a `.`, `b`, `B`, `x`, `X`, `e`, or `E`. | |
| struct zero_jump_table_s final | |
| : public tokenizer_jump_table_s<zero_jump_table_s> { | |
| static int rational(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_rat; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return rational_jump_table_s<>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int binary(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return binary_jump_table_s<>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int hexadecimal(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return hexadecimal_jump_table_s<>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int exponent(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_rat; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return exponent_jump_table_s<>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr zero_jump_table_s() : Super(&return_zero) { | |
| jt['.'] = &rational; | |
| jt['b'] = &binary; | |
| jt['B'] = &binary; | |
| jt['x'] = &hexadecimal; | |
| jt['X'] = &hexadecimal; | |
| jt['e'] = &exponent; | |
| jt['E'] = &exponent; | |
| } | |
| }; | |
| struct int_jump_table_s final | |
| : public tokenizer_jump_table_s<int_jump_table_s> { | |
| static int digit(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return int_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int exponent(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_rat; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return exponent_jump_table_s<>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int rational(tokenizer_s *const tokenizer, token_t *out_token) { | |
| out_token->type = token_type_rat; | |
| tokenizer->skip_current(); | |
| out_token->length += 1; | |
| [[clang::musttail]] return rational_jump_table_s<>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| constexpr int_jump_table_s() : Super(&return_zero) { | |
| for (unsigned i = 0; i < 256; i++) { | |
| if (in_range<'0', '9'>(static_cast<char>(i))) { | |
| jt[i] = &digit; | |
| } | |
| } | |
| jt['.'] = &rational; | |
| jt['e'] = &exponent; | |
| jt['E'] = &exponent; | |
| } | |
| }; | |
| struct main_jump_table_s final | |
| : public tokenizer_jump_table_s<main_jump_table_s> { | |
| static int leading_char(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_ident, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return ident_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int one_to_nine(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_int, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return int_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int zero(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_int, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return zero_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int maybe_if(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_if, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return if_f_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int maybe_for(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_for, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return for_o_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int maybe_loop(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_loop, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return loop_o1_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int maybe_nil(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_nil, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return nil_i_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int greater(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_cgt, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return gt_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int less(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_clt, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return lt_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int equals(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_eq, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return eq_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int skip_one(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return main_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int newline(tokenizer_s *const tokenizer, token_t *out_token) { | |
| tokenizer->current += 1; | |
| tokenizer->line += 1; | |
| tokenizer->column = 0; | |
| [[clang::musttail]] return main_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int eof(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_end, tokenizer->current, 0, tokenizer->line, | |
| tokenizer->column}; | |
| return 0; | |
| } | |
| static int comment(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_comment, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return comment_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int string(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_string, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return string_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| static int codepoint(tokenizer_s *const tokenizer, token_t *out_token) { | |
| *out_token = {token_type_codepoint, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return codepoint_jump_table_s::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| template <token_type_t TYPE, token_type_t ASSIGNMENT_TYPE> | |
| static int single_char_token_maybe_assignment(tokenizer_s *const tokenizer, | |
| token_t *out_token) { | |
| *out_token = {TYPE, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| [[clang::musttail]] return maybe_assignment_jump_table_s< | |
| ASSIGNMENT_TYPE>::singleton() | |
| .jt[tokenizer->data[tokenizer->current]](tokenizer, out_token); | |
| } | |
| template <token_type_t TYPE> | |
| static int single_char_token(tokenizer_s *const tokenizer, | |
| token_t *out_token) { | |
| *out_token = {TYPE, tokenizer->current, 1, tokenizer->line, | |
| tokenizer->column}; | |
| tokenizer->skip_current(); | |
| return 0; | |
| } | |
| constexpr main_jump_table_s() : Super(&unhandled) { | |
| for (unsigned i = 0; i < 256; i++) { | |
| if (in_range<'1', '9'>(static_cast<char>(i))) { | |
| jt[i] = &one_to_nine; | |
| } else if (is_ident_leading_char(static_cast<char>(i))) { | |
| jt[i] = &leading_char; | |
| } | |
| } | |
| jt['0'] = &zero; | |
| jt['i'] = &maybe_if; | |
| jt['f'] = &maybe_for; | |
| jt['l'] = &maybe_loop; | |
| jt['n'] = &maybe_nil; | |
| jt['>'] = &greater; | |
| jt['<'] = &less; | |
| jt['='] = = | |
| jt[' '] = &skip_one; | |
| jt['\t'] = &skip_one; | |
| jt['\r'] = &skip_one; | |
| jt['\n'] = &newline; | |
| jt['\0'] = &eof; | |
| jt['#'] = &comment; | |
| jt['"'] = &string; | |
| jt['\''] = &codepoint; | |
| #define TEMPLATE_TOKEN(TYPE, TOKEN) \ | |
| jt[TOKEN] = &single_char_token_maybe_assignment<TYPE, TYPE##_eq> | |
| TEMPLATE_TOKEN(token_type_and, '&'); | |
| TEMPLATE_TOKEN(token_type_or, '|'); | |
| TEMPLATE_TOKEN(token_type_xor, '^'); | |
| TEMPLATE_TOKEN(token_type_not, '~'); | |
| TEMPLATE_TOKEN(token_type_add, '+'); | |
| TEMPLATE_TOKEN(token_type_sub, '-'); | |
| TEMPLATE_TOKEN(token_type_mul, '*'); | |
| TEMPLATE_TOKEN(token_type_div, '/'); | |
| TEMPLATE_TOKEN(token_type_mod, '%'); | |
| #undef TEMPLATE_TOKEN | |
| #define TEMPLATE_TOKEN(TYPE, TOKEN) jt[TOKEN] = &single_char_token<TYPE> | |
| TEMPLATE_TOKEN(token_type_lcurly, '{'); | |
| TEMPLATE_TOKEN(token_type_rcurly, '}'); | |
| TEMPLATE_TOKEN(token_type_lparen, '('); | |
| TEMPLATE_TOKEN(token_type_rparen, ')'); | |
| TEMPLATE_TOKEN(token_type_larray, '['); | |
| TEMPLATE_TOKEN(token_type_rarray, ']'); | |
| TEMPLATE_TOKEN(token_type_comma, ','); | |
| TEMPLATE_TOKEN(token_type_dot, '.'); | |
| TEMPLATE_TOKEN(token_type_colon, ':'); | |
| TEMPLATE_TOKEN(token_type_semicolon, ';'); | |
| TEMPLATE_TOKEN(token_type_at, '@'); | |
| #undef TEMPLATE_TOKEN | |
| } | |
| }; | |
// One statically-allocated tokenizer that tokenizer_create hands out on the
// fast path before falling back to malloc; g_cached_available is the
// atomic flag that claims/releases it.
tokenizer_s g_cached_tokenizer;
std::atomic_bool g_cached_available(true);
| int tokenizer_create(const char *const data, tokenizer_t *out_tokenizer) { | |
| if (nullptr == data) { | |
| return -1; | |
| } | |
| tokenizer_s *tokenizer = nullptr; | |
| if (g_cached_available.exchange(false)) { | |
| tokenizer = &g_cached_tokenizer; | |
| } else { | |
| // Slow path - allocate! | |
| tokenizer = reinterpret_cast<tokenizer_s *>(malloc(sizeof(tokenizer_s))); | |
| } | |
| memset(tokenizer, 0, sizeof(tokenizer_s)); | |
| tokenizer->data = reinterpret_cast<const uint8_t *>(data); | |
| tokenizer->current = 0; | |
| *out_tokenizer = tokenizer; | |
| return 0; | |
| } | |
| int tokenizer_next(tokenizer_t t, token_t *out_token) { | |
| tokenizer_s *const tokenizer = reinterpret_cast<tokenizer_s *>(t); | |
| return main_jump_table_s::singleton().jt[tokenizer->data[tokenizer->current]]( | |
| tokenizer, out_token); | |
| } | |
| int tokenizer_destroy(tokenizer_t tokenizer) { | |
| // Check if we are using the global cached tokenizer first. | |
| if (tokenizer == &g_cached_tokenizer) { | |
| g_cached_available = true; | |
| } else { | |
| // Slow path - free! | |
| free(tokenizer); | |
| } | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment