Skip to content

Instantly share code, notes, and snippets.

@VideoCarp
Last active May 8, 2022 03:33
Show Gist options
  • Save VideoCarp/71eeb6cabadc6f450ede32cd6ebf21c2 to your computer and use it in GitHub Desktop.
A lexer for the English language written in F#.
open System
open System.Text
// Character-class predicates used by the lexer (see the note at the end of the file).
// True for the ASCII decimal digits '0'..'9'.
let numeric c = '0' <= c && c <= '9'
// True for characters that may appear inside a word:
// ASCII letters plus the apostrophe (so contractions stay one token).
let ofword c =
    ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c = '\''
// True exactly for the double-quote character. (Stray quote keeps highlighters happy.) "
let str c = c = '"'
// True for every character except the double quote. "
let notstr c = c <> '"'
// Lexer
/// Tokenises [input] into a list of (lexeme, tag) pairs, in source order.
/// Punctuation characters become single-character tokens; runs of letters and
/// apostrophes become "word" tokens; digit runs become "number" tokens; and
/// double-quoted spans become "quote" tokens with the quotes stripped.
/// Any unrecognised character is silently skipped.
/// The scan is tail-recursive (accumulator + List.rev), so arbitrarily long
/// inputs cannot overflow the stack — the original nested-cons recursion could.
let lexer (input: string) =
    let len = input.Length
    // Index of the last char in the run starting at [i] that satisfies [cond];
    // returns i - 1 when input.[i] itself fails (e.g. an empty "" quote).
    let findEnd cond i =
        let mutable j = i
        while j < len && cond input.[j] do
            j <- j + 1
        j - 1
    let rec lex start acc =
        if start >= len then
            List.rev acc
        else
            let grapheme = input.[start]
            // Emit a one-character token, then continue after it.
            let atk tk = lex (start + 1) (tk :: acc)
            // Emit a multi-character token: skip [skip] leading chars (the
            // opening quote, if any), take chars while [cond] holds, then
            // resume after the token plus [skip] (the closing quote, if any).
            let abtk cond tag skip =
                let tkStart = start + skip
                let tkEnd = findEnd cond tkStart
                lex (tkEnd + skip + 1) ((input.[tkStart..tkEnd], tag) :: acc)
            match grapheme with
            | '(' -> atk ("(", "oparen")
            | ')' -> atk (")", "cparen")
            | '.' -> atk (".", "stop")
            | ',' -> atk (",", "comma")
            | ';' -> atk (";", "semicolon")
            | ':' -> atk (":", "colon")
            | '=' -> atk ("=", "equals")
            | '-' -> atk ("-", "en dash")
            | '—' -> atk ("—", "em dash")
            | '!' -> atk ("!", "exclamation")
            | '?' -> atk ("?", "question")
            // A lone apostrophe is punctuation; one inside a word is merged by
            // the ofword case below because the word starts on a letter.
            | '\'' -> atk ("'", "apostrophe")
            | '…' -> atk ("…", "ellipsis")
            | _ when ofword grapheme -> abtk ofword "word" 0
            | _ when numeric grapheme -> abtk numeric "number" 0
            | _ when str grapheme -> abtk notstr "quote" 1
            | _ -> lex (start + 1) acc
    lex 0 []
// Demo driver: prompt for a file path on stdin, lex that file's contents,
// and print every (lexeme, tag) token on its own line.
printf "Program: "
let program = Console.ReadLine() |> IO.File.ReadAllText
let tokens = lexer program
for token in tokens do
    printfn "%A" token
(*
----***********************************************************************************************
* This is a lexical analyser written by hand in F#, designed to analyse the English language.
* It cannot handle all the features of the language, but it can handle most of them.
* It can be utilised as a basis for basic natural language processing and as a
* template for lexical analysers.
* Benchmarks show it is fairly fast: it processed 100,000 characters (roughly
* 25,000 strings) in 48 ms on average, measured over 100 runs with Diagnostics.Stopwatch.
----***********************************************************************************************
*)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment