Skip to content

Instantly share code, notes, and snippets.

@VideoCarp
Last active May 8, 2022 03:33
Show Gist options
  • Save VideoCarp/71eeb6cabadc6f450ede32cd6ebf21c2 to your computer and use it in GitHub Desktop.
A lexer for the English language written in F#.
open System
open System.Text
// Character-class predicates used by the lexer (see the note at the end of the file).
// True for the ASCII decimal digits '0'..'9'.
let numeric c = '0' <= c && c <= '9'
// True for characters that may appear inside a word:
// ASCII letters plus the apostrophe (so contractions stay one token).
let ofword c =
    ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c = '\''
// True exactly for the double-quote character. (Stray quote keeps highlighters happy.) "
let str c = c = '"'
// True for every character except the double quote. "
let notstr c = c <> '"'
// Lexer
/// Tokenises [input] into a list of (lexeme, tag) pairs, in source order.
/// Punctuation characters become single-character tokens; runs of letters and
/// apostrophes become "word" tokens; digit runs become "number" tokens; and
/// double-quoted spans become "quote" tokens with the quotes stripped.
/// Any unrecognised character is silently skipped.
/// The scan is tail-recursive (accumulator + List.rev), so arbitrarily long
/// inputs cannot overflow the stack — the original nested-cons recursion could.
let lexer (input: string) =
    let len = input.Length
    // Index of the last char in the run starting at [i] that satisfies [cond];
    // returns i - 1 when input.[i] itself fails (e.g. an empty "" quote).
    let findEnd cond i =
        let mutable j = i
        while j < len && cond input.[j] do
            j <- j + 1
        j - 1
    let rec lex start acc =
        if start >= len then
            List.rev acc
        else
            let grapheme = input.[start]
            // Emit a one-character token, then continue after it.
            let atk tk = lex (start + 1) (tk :: acc)
            // Emit a multi-character token: skip [skip] leading chars (the
            // opening quote, if any), take chars while [cond] holds, then
            // resume after the token plus [skip] (the closing quote, if any).
            let abtk cond tag skip =
                let tkStart = start + skip
                let tkEnd = findEnd cond tkStart
                lex (tkEnd + skip + 1) ((input.[tkStart..tkEnd], tag) :: acc)
            match grapheme with
            | '(' -> atk ("(", "oparen")
            | ')' -> atk (")", "cparen")
            | '.' -> atk (".", "stop")
            | ',' -> atk (",", "comma")
            | ';' -> atk (";", "semicolon")
            | ':' -> atk (":", "colon")
            | '=' -> atk ("=", "equals")
            | '-' -> atk ("-", "en dash")
            | '—' -> atk ("—", "em dash")
            | '!' -> atk ("!", "exclamation")
            | '?' -> atk ("?", "question")
            // A lone apostrophe is punctuation; one inside a word is merged by
            // the ofword case below because the word starts on a letter.
            | '\'' -> atk ("'", "apostrophe")
            | '…' -> atk ("…", "ellipsis")
            | _ when ofword grapheme -> abtk ofword "word" 0
            | _ when numeric grapheme -> abtk numeric "number" 0
            | _ when str grapheme -> abtk notstr "quote" 1
            | _ -> lex (start + 1) acc
    lex 0 []
// Demo driver: prompt for a file path on stdin, lex that file's contents,
// and print every (lexeme, tag) token on its own line.
printf "Program: "
let program = Console.ReadLine() |> IO.File.ReadAllText
let tokens = lexer program
for token in tokens do
    printfn "%A" token
(*
----***********************************************************************************************
* This is a lexical analyser written by hand in F#, designed to analyse the English language.
* It cannot handle all the features of the language, but it can handle most of them.
* It can be utilised as a basis for basic natural language processing and as a
* template for lexical analysers.
* Benchmarks show it is fairly fast: it processed 100,000 characters (roughly
* 25,000 strings) in 48 ms on average, measured over 100 runs with Diagnostics.Stopwatch.
----***********************************************************************************************
*)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment