Created
June 21, 2023 19:45
-
-
Save erodactyl/1a331d281d84b506c813be3f75daaf82 to your computer and use it in GitHub Desktop.
A lexer in OCaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(** Tokens produced by [lex]. *)
type token =
  | INT of int  (** A run of decimal digits, converted with [int_of_string]. *)
  | IDENTIFIER of string
      (** A name made of letters, digits and ['_'] (never starting with a
          digit) that is not one of the keywords below. *)
  | PLUS  (** [+] *)
  | MINUS  (** [-] *)
  | TIMES  (** [*] *)
  | DIV  (** [/] *)
  | LET  (** The keyword [let]. *)
  | ASSIGN  (** [=] *)
  | EQ  (** [==] *)
  | NOT_EQ  (** [!=] *)
  | BANG  (** [!] *)
  | LT  (** [<] *)
  | GT  (** [>] *)
  | LTE  (** [<=] *)
  | GTE  (** [>=] *)
  | IF  (** The keyword [if]. *)
  | ELSE  (** The keyword [else]. *)
  | RETURN  (** The keyword [return]. *)
  | FUNCTION  (** The keyword [fn]. *)
  | SEMICOLON  (** [;] *)
  | COMMA  (** [,] *)
  | LPAREN  (** [(] *)
  | RPAREN  (** [)] *)
  | LBRACE  (** [{] *)
  | RBRACE  (** [}] *)
  | EOF  (** Emitted when the lexer meets a NUL character (['\000']). *)
(** [lex input] splits [input] into its list of tokens, in source order.

    Spaces, tabs, newlines and carriage returns separate tokens and are
    dropped. Lexing stops at the end of the string (no [EOF] token is appended
    in that case) or at the first NUL character (['\000']), where [EOF] becomes
    the last token and the rest of the input is ignored.

    @raise Failure
      on a character that cannot start any token (message
      ["Unexpected character: c"]), or when a run of digits is rejected by
      [int_of_string] (for instance because it does not fit in an [int]). *)
let lex input =
  let len = String.length input in
  (* [skip_while pred i] is the first index [j >= i] at which the input ends
     or [pred input.[j]] is false. *)
  let rec skip_while pred i =
    if i < len && pred input.[i] then skip_while pred (i + 1) else i
  in
  let is_digit = function '0' .. '9' -> true | _ -> false in
  (* Digits are accepted here because this predicate is only applied to a
     word whose first character is already known to be a letter or '_'. *)
  let is_identifier_char = function
    | 'a' .. 'z' | 'A' .. 'Z' | '_' | '0' .. '9' -> true
    | _ -> false
  in
  let keyword_or_identifier = function
    | "fn" -> FUNCTION
    | "let" -> LET
    | "if" -> IF
    | "else" -> ELSE
    | "return" -> RETURN
    | name -> IDENTIFIER name
  in
  (* Picks [double] when the character at [index] is followed by '=', and
     [single] otherwise; also returns how many characters were consumed. The
     bounds test comes first so that a comparator sitting on the very last
     character of the input is read as [single] instead of reading past the
     end of the string. *)
  let comparator index ~single ~double =
    if index + 1 < len && input.[index + 1] = '=' then (double, 2)
    else (single, 1)
  in
  (* [acc] holds the tokens read so far, most recent first; the accumulator
     keeps the loop tail-recursive, so long inputs do not grow the stack. *)
  let rec lex' index acc =
    if index >= len then List.rev acc
    else
      match input.[index] with
      | '0' .. '9' ->
          let stop = skip_while is_digit index in
          let digits = String.sub input index (stop - index) in
          lex' stop (INT (int_of_string digits) :: acc)
      | 'a' .. 'z' | 'A' .. 'Z' | '_' ->
          let stop = skip_while is_identifier_char index in
          let word = String.sub input index (stop - index) in
          lex' stop (keyword_or_identifier word :: acc)
      | '+' -> lex' (index + 1) (PLUS :: acc)
      | '-' -> lex' (index + 1) (MINUS :: acc)
      | '*' -> lex' (index + 1) (TIMES :: acc)
      | '/' -> lex' (index + 1) (DIV :: acc)
      | '=' ->
          let tok, width = comparator index ~single:ASSIGN ~double:EQ in
          lex' (index + width) (tok :: acc)
      | '!' ->
          let tok, width = comparator index ~single:BANG ~double:NOT_EQ in
          lex' (index + width) (tok :: acc)
      | '<' ->
          let tok, width = comparator index ~single:LT ~double:LTE in
          lex' (index + width) (tok :: acc)
      | '>' ->
          let tok, width = comparator index ~single:GT ~double:GTE in
          lex' (index + width) (tok :: acc)
      | '(' -> lex' (index + 1) (LPAREN :: acc)
      | ')' -> lex' (index + 1) (RPAREN :: acc)
      | '{' -> lex' (index + 1) (LBRACE :: acc)
      | '}' -> lex' (index + 1) (RBRACE :: acc)
      | ';' -> lex' (index + 1) (SEMICOLON :: acc)
      | ',' -> lex' (index + 1) (COMMA :: acc)
      | ' ' | '\t' | '\n' | '\r' -> lex' (index + 1) acc
      (* A NUL character ends lexing early; whatever follows is ignored. *)
      | '\000' -> List.rev (EOF :: acc)
      | c -> raise (Failure ("Unexpected character: " ^ String.make 1 c))
  in
  lex' 0 []
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment