Skip to content

Instantly share code, notes, and snippets.

@erodactyl
Created June 21, 2023 19:45
Show Gist options
  • Save erodactyl/1a331d281d84b506c813be3f75daaf82 to your computer and use it in GitHub Desktop.
Save erodactyl/1a331d281d84b506c813be3f75daaf82 to your computer and use it in GitHub Desktop.
A lexer in OCaml
(** Tokens produced by the lexer. *)
type token =
| INT of int (** Integer literal: a run of decimal digits. *)
| IDENTIFIER of string (** Name made of letters, [_] and (after the first character) digits that is not a keyword. *)
| PLUS (** [+] *)
| MINUS (** [-] *)
| TIMES (** [*] *)
| DIV (** [/] *)
| LET (** Keyword [let]. *)
| ASSIGN (** [=] *)
| EQ (** [==] *)
| NOT_EQ (** [!=] *)
| BANG (** [!] *)
| LT (** [<] *)
| GT (** [>] *)
| LTE (** [<=] *)
| GTE (** [>=] *)
| IF (** Keyword [if]. *)
| ELSE (** Keyword [else]. *)
| RETURN (** Keyword [return]. *)
| FUNCTION (** Keyword [fn]. *)
| SEMICOLON (** [;] *)
| COMMA (** [,] *)
| LPAREN (** [(] *)
| RPAREN (** [)] *)
| LBRACE (** [{] *)
| RBRACE (** [}] *)
| EOF (** Emitted when the lexer meets a NUL character (['\000']). *)
(** [lex input] converts [input] into the list of its tokens, in source order.

    Spaces, tabs, newlines and carriage returns separate tokens and are
    otherwise ignored. A NUL character (['\000']) ends lexing and appends
    [EOF]; reaching the end of [input] without a NUL ends the list without an
    [EOF] token.

    @raise Failure if an unsupported character is found, or if an integer
    literal does not fit in [int]. *)
let lex input =
  let len = String.length input in
  let is_digit = function '0' .. '9' -> true | _ -> false in
  (* Digits are accepted here because this predicate is only applied once the
     first character is already known to be a letter or underscore. *)
  let is_ident_char = function
    | 'a' .. 'z' | 'A' .. 'Z' | '_' | '0' .. '9' -> true
    | _ -> false
  in
  (* [span_end pred i] is the first index >= [i] whose character fails [pred],
     or [len] when every remaining character satisfies it. *)
  let rec span_end pred i =
    if i < len && pred input.[i] then span_end pred (i + 1) else i
  in
  let int_token digits =
    match int_of_string_opt digits with
    | Some n -> INT n
    | None -> failwith ("Integer literal out of range: " ^ digits)
  in
  let word_token = function
    | "fn" -> FUNCTION
    | "let" -> LET
    | "if" -> IF
    | "else" -> ELSE
    | "return" -> RETURN
    | name -> IDENTIFIER name
  in
  (* [acc] holds the tokens found so far in reverse order, which keeps the
     loop tail-recursive on arbitrarily long input. *)
  let rec lex' index acc =
    if index >= len then List.rev acc
    else
      let emit tok width = lex' (index + width) (tok :: acc) in
      (* Only peek at the next character when one exists, so an operator that
         is the last character of [input] lexes as its one-character form. *)
      let emit_comparison ~single ~with_eq =
        if index + 1 < len && Char.equal input.[index + 1] '=' then
          emit with_eq 2
        else emit single 1
      in
      match input.[index] with
      | '0' .. '9' ->
        let width = span_end is_digit index - index in
        emit (int_token (String.sub input index width)) width
      | 'a' .. 'z' | 'A' .. 'Z' | '_' ->
        let width = span_end is_ident_char index - index in
        emit (word_token (String.sub input index width)) width
      | '+' -> emit PLUS 1
      | '-' -> emit MINUS 1
      | '*' -> emit TIMES 1
      | '/' -> emit DIV 1
      | '=' -> emit_comparison ~single:ASSIGN ~with_eq:EQ
      | '!' -> emit_comparison ~single:BANG ~with_eq:NOT_EQ
      | '<' -> emit_comparison ~single:LT ~with_eq:LTE
      | '>' -> emit_comparison ~single:GT ~with_eq:GTE
      | '(' -> emit LPAREN 1
      | ')' -> emit RPAREN 1
      | '{' -> emit LBRACE 1
      | '}' -> emit RBRACE 1
      | ';' -> emit SEMICOLON 1
      | ',' -> emit COMMA 1
      | ' ' | '\t' | '\n' | '\r' -> lex' (index + 1) acc
      (* A NUL character terminates lexing; anything after it is ignored. *)
      | '\000' -> List.rev (EOF :: acc)
      | c -> failwith ("Unexpected character: " ^ String.make 1 c)
  in
  lex' 0 []
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment