Skip to content

Instantly share code, notes, and snippets.

@eatnumber1
Created August 4, 2013 07:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eatnumber1/6149541 to your computer and use it in GitHub Desktop.
Save eatnumber1/6149541 to your computer and use it in GitHub Desktop.
SML Implementation of Regular Expressions
(* langfc-src/scanner-parser/langf-hand-scanner.sml
*
* COPYRIGHT (c) 2011-2013 Matthew Fluet (http://www.cs.rit.edu/~mtf)
* All rights reserved.
*
* Rochester Institute of Technology
* 4005-711
* Q20112,Q20122
*
* COPYRIGHT (c) 2009 Matthew Fluet (http://tti-c.org/fluet)
* All rights reserved.
*
* University of Chicago
* CMSC 22610
* Winter 2009
*
* Hand-written LangF scanner as a StringCvt.reader.
*)
structure Regex =
struct
datatype regex =
(* No support for matching the null string *)
LITERAL of char
| END
| CONCATENATION of regex list
| ALTERNATION of regex list
| STAR of regex
fun eval (pattern : regex)
(reader: (char, 'strm) StringCvt.reader)
(strm: 'strm)
: (string * 'strm) option =
let
fun nextChar (f : (char * 'strm) -> (string * 'strm) option)
: (string * 'strm) option =
Option.mapPartial f (reader strm)
fun eval' s p = eval p reader s
in
case pattern of
END => if isSome (reader strm) then NONE else SOME ("", strm)
| LITERAL (p) =>
nextChar (
fn (c, strm) => if c = p
then SOME (Char.toString c, strm)
else NONE
)
| CONCATENATION (nil) => SOME ("", strm)
| CONCATENATION (h::t) =>
let
fun combine ((str, strm) : (string * 'strm))
: (string * 'strm) option =
Option.compose ((
fn (str', strm) => (str ^ str', strm)
), (eval' strm)) (CONCATENATION (t))
in
Option.composePartial (combine, (eval' strm)) h
end
| ALTERNATION (pl) =>
let
val (matches : (string * 'strm) list) =
List.mapPartial (eval' strm) pl
in
foldr (
fn ((str, strm), b) =>
SOME (
case b of
NONE => (str, strm)
| SOME (str', strm') =>
if (size str) > (size str')
then (str, strm)
else (str', strm')
)
) NONE matches
end
| STAR (p) =>
let
val m = eval' strm p
in
if not (isSome m)
then SOME ("", strm)
else
let
val (str, strm') = valOf m
val (str', strm'') = valOf (eval' strm' pattern)
in
SOME (str ^ str', strm'')
end
end
end
fun literalList s =
let
fun literalList' (s : char list) : regex list =
case s of
nil => nil
| (h::t) => (LITERAL (h)) :: (literalList' t)
in
literalList' (String.explode s)
end
fun stringLiteral s = CONCATENATION (literalList s)
local
fun runTest ((name, pattern, str, expect)
: (string * regex * string * bool))
: string =
let
fun reader nil = NONE
| reader (h::t) = SOME (h, t)
val result = Option.map (fn (str, strm) => str) (
eval pattern reader (String.explode str)
)
val resultStr = if (isSome result) = expect then "pass" else "fail"
in
name ^ ": " ^ resultStr
end
val whitespace_r =
STAR (ALTERNATION [LITERAL #" ", LITERAL #"\t", LITERAL #"\n"])
val alphabet_r =
ALTERNATION (
literalList "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
)
val tests = [
("literal", LITERAL #"a", "a", true),
("concat", stringLiteral "and", "and", true),
("alternation 1", ALTERNATION (literalList "yx"), "y", true),
("alternation 2", ALTERNATION (literalList "yx"), "x", true),
("end", END, "", true),
("star", STAR (LITERAL #"a"), "aaaa", true),
(
"whitespace",
whitespace_r,
" " ^ (Char.toString #"\n") ^ (Char.toString #"\t"),
true
),
(
"and with whitespace",
CONCATENATION [whitespace_r, stringLiteral "and", whitespace_r, END],
" and",
true
),
(
"ambiguous",
CONCATENATION [
ALTERNATION [
stringLiteral "if",
stringLiteral "ifc"
],
END
],
"ifc",
true
),
(
"ambiguous 2",
CONCATENATION [
ALTERNATION [
stringLiteral "if",
stringLiteral "ifc"
],
END
],
"if",
true
),
(
"ambiguous 3",
CONCATENATION [
ALTERNATION [
stringLiteral "and",
STAR (alphabet_r)
],
END
],
"andothers",
true
)
]
in
val testResults = map (runTest) tests
end
end
structure LangFHandScanner: LANGF_HAND_SCANNER =
struct
local
type Pattern = Regex.regex * (string -> Tokens.token)
fun alwaysReturn tok _ = tok
fun atomFun tok str = (tok (Atom.atom str))
local
val lowerAlphaList = Regex.literalList "abcdefghijklmnopqrstuvwxyz"
val upperAlphaList = Regex.literalList "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
val alphaList = lowerAlphaList @ upperAlphaList
val numList = Regex.literalList "1234567890"
val identifierList =
alphaList @ numList @ (Regex.literalList "'_")
in
val lowerAlpha = Regex.ALTERNATION lowerAlphaList
val upperAlpha = Regex.ALTERNATION upperAlphaList
val alpha = Regex.ALTERNATION alphaList
val identifier = Regex.ALTERNATION identifierList
val number = Regex.ALTERNATION numList
end
val (patterns : Pattern list) = [
(Regex.CONCATENATION [
Regex.LITERAL #"'",
lowerAlpha,
Regex.STAR identifier
], (atomFun Tokens.TYVAR_NAME)),
(Regex.CONCATENATION [
upperAlpha,
Regex.STAR identifier
], (atomFun Tokens.CON_NAME)),
(Regex.CONCATENATION [
lowerAlpha,
Regex.STAR identifier
], (atomFun Tokens.VAR_NAME)),
(Regex.ALTERNATION [
Regex.CONCATENATION [
Regex.LITERAL #"~",
number,
Regex.STAR number
],
Regex.CONCATENATION [
number,
Regex.STAR number
]
(* TODO: Error handling here *)
], (fn x => Tokens.INTEGER (valOf (IntInf.fromString x)))),
(Regex.stringLiteral "and", (alwaysReturn Tokens.KW_and)),
(Regex.stringLiteral "andalso", (alwaysReturn Tokens.KW_andalso)),
(Regex.stringLiteral "case", (alwaysReturn Tokens.KW_case)),
(Regex.stringLiteral "datatype", (alwaysReturn Tokens.KW_datatype)),
(Regex.stringLiteral "else", (alwaysReturn Tokens.KW_else)),
(Regex.stringLiteral "end", (alwaysReturn Tokens.KW_end)),
(Regex.stringLiteral "fn", (alwaysReturn Tokens.KW_fn)),
(Regex.stringLiteral "fun", (alwaysReturn Tokens.KW_fun)),
(Regex.stringLiteral "if", (alwaysReturn Tokens.KW_if)),
(Regex.stringLiteral "in", (alwaysReturn Tokens.KW_in)),
(Regex.stringLiteral "let", (alwaysReturn Tokens.KW_let)),
(Regex.stringLiteral "of", (alwaysReturn Tokens.KW_of)),
(Regex.stringLiteral "orelse", (alwaysReturn Tokens.KW_orelse)),
(Regex.stringLiteral "then", (alwaysReturn Tokens.KW_then)),
(Regex.stringLiteral "type", (alwaysReturn Tokens.KW_type)),
(Regex.stringLiteral "val", (alwaysReturn Tokens.KW_val)),
(Regex.stringLiteral "+", (alwaysReturn Tokens.PLUS)),
(Regex.stringLiteral "-", (alwaysReturn Tokens.MINUS)),
(Regex.stringLiteral "*", (alwaysReturn Tokens.ASTERISK)),
(Regex.stringLiteral "/", (alwaysReturn Tokens.SLASH)),
(Regex.stringLiteral "%", (alwaysReturn Tokens.PERCENT)),
(Regex.stringLiteral "~", (alwaysReturn Tokens.TILDE)),
(Regex.stringLiteral "==", (alwaysReturn Tokens.EQEQ)),
(Regex.stringLiteral "<>", (alwaysReturn Tokens.LTGT)),
(Regex.stringLiteral "<=", (alwaysReturn Tokens.LTEQ)),
(Regex.stringLiteral "<", (alwaysReturn Tokens.LT)),
(Regex.stringLiteral ">=", (alwaysReturn Tokens.GTEQ)),
(Regex.stringLiteral ">", (alwaysReturn Tokens.GT)),
(Regex.stringLiteral "^", (alwaysReturn Tokens.CARET)),
(Regex.stringLiteral "#", (alwaysReturn Tokens.HASH)),
(Regex.stringLiteral "!", (alwaysReturn Tokens.BANG)),
(Regex.stringLiteral ":=", (alwaysReturn Tokens.EQ_COLON)),
(Regex.stringLiteral "(", (alwaysReturn Tokens.LPAREN)),
(Regex.stringLiteral ")", (alwaysReturn Tokens.RPAREN)),
(Regex.stringLiteral "[", (alwaysReturn Tokens.LSBRACK)),
(Regex.stringLiteral "]", (alwaysReturn Tokens.RSBRACK)),
(Regex.stringLiteral "{", (alwaysReturn Tokens.LCBRACK)),
(Regex.stringLiteral "}", (alwaysReturn Tokens.RCBRACK)),
(Regex.stringLiteral "->", (alwaysReturn Tokens.MINUS_ARROW)),
(Regex.stringLiteral "=>", (alwaysReturn Tokens.EQ_ARROW)),
(Regex.stringLiteral "=", (alwaysReturn Tokens.EQ)),
(Regex.stringLiteral ":", (alwaysReturn Tokens.COLON)),
(Regex.stringLiteral ",", (alwaysReturn Tokens.COMMA)),
(Regex.stringLiteral ";", (alwaysReturn Tokens.SEMI)),
(Regex.stringLiteral "|", (alwaysReturn Tokens.VBAR)),
(Regex.stringLiteral "_", (alwaysReturn Tokens.UNDERSCORE))
]
in
fun scan {reportError: string -> unit}
(charRdr: (char, 'strm) StringCvt.reader)
(strm: 'strm) : (Tokens.token * 'strm) option =
let
fun stripStream (strm : 'strm) =
let
fun skipComment (charRdr: (char, 'strm) StringCvt.reader)
(strm : 'strm)
: 'strm option =
let
fun isBeginComment (strm : 'strm) : bool =
let
val (isComment : bool option) = Option.join (Option.map (
fn (c, s) => Option.map (
fn (c', s') =>
if String.implode (c :: #" " :: c' :: nil) = "( *" then
true
else
false
) (charRdr s)
) (charRdr strm))
in
case isComment of
SOME (b : bool) => b
| NONE => false
end
fun commentScan (strm : 'strm) : 'strm =
let
fun consumeComment (strm : 'strm) : 'strm =
let
val (c, s) = valOf (charRdr strm)
val (c', s') = valOf (charRdr s)
in
if String.implode (c :: #" " :: c' :: nil) = "* )" then
s'
else if isBeginComment strm then
consumeComment (commentScan s')
else
consumeComment s
end
in
consumeComment strm
end
in
if not (isBeginComment strm) then
NONE
else
let
fun nextStrm strm =
let
val next = charRdr strm
in
case next of
NONE => raise Fail "WAT"
| SOME (_, s) => s
end
in
SOME (commentScan (nextStrm (nextStrm strm)))
end
end
val commentStrm =
let
fun wsCommentStrm strm =
let
val ws = StringCvt.skipWS charRdr strm
val s = skipComment charRdr ws
in
case s of
SOME (s') => wsCommentStrm s'
| NONE => ws
end
in
wsCommentStrm strm
end
in
commentStrm
end
fun scan' (strm: 'strm) : (Tokens.token * 'strm) option =
let
fun regexScan (strm : 'strm) =
let
fun longestMatch (strm: 'strm)
(patterns : Pattern list)
: (Tokens.token * 'strm) option =
let
fun longestMatch' ((regex, f), (lp : (string * (Tokens.token * 'strm)) option)) =
let
val (m : (string * 'strm) option) = Regex.eval regex charRdr strm
in
case (m, lp) of
(NONE, _) => lp
| (SOME (str, strm'), NONE) => SOME (str, (f str, strm'))
| (SOME (str, strm'), SOME (str', _)) =>
if (size str) > (size str')
then SOME (str, (f str, strm')) else lp
end
in
Option.map (#2) (foldr longestMatch' NONE patterns)
end
in
longestMatch strm patterns
end
fun stringScan (strm : 'strm) : (Tokens.token * 'strm) option =
let
val (_, strm'') = valOf (charRdr strm)
fun stringScan' (strm : 'strm) : (char list * 'strm) =
let
val (c, strm''') = valOf (charRdr strm)
fun parseEscape _ : (char * 'strm) =
let
val (c', strm'''') = valOf (charRdr strm''')
in
case c' of
#"a" => (#"\a", strm'''')
| #"b" => (#"\b", strm'''')
| #"f" => (#"\f", strm'''')
| #"n" => (#"\n", strm'''')
| #"r" => (#"\r", strm'''')
| #"t" => (#"\t", strm'''')
| #"v" => (#"\v", strm'''')
| #"\\" => (#"\\", strm'''')
| #"\"" => (#"\"", strm'''')
| _ =>
let
val (one, s) = valOf (charRdr strm''')
val (two, s') = valOf (charRdr s)
val (three, s'') = valOf (charRdr s')
val str = String.implode (one :: two :: three :: nil)
val num = valOf (Int.fromString str)
in
(Char.chr num, s'')
end
end
in
if c = #"\\" then
let
val (c', s) = parseEscape ()
val (cl', s') = stringScan' s
in
(c' :: cl', s')
end
else if c = #"\"" then
(nil, strm''')
else
let
val (cl : char list, s : 'strm) = (stringScan' strm''')
in
(c :: cl, s)
end
end
val (chrs, strm''') = stringScan' strm''
in
SOME (Tokens.STRING (String.implode chrs), strm''')
end
val (scanners : (('strm -> bool) * ('strm -> (Tokens.token * 'strm) option)) list) = [
((fn s => not (isSome (charRdr s))), (fn _ => NONE)),
((
fn s =>
case (Option.map (#1) (charRdr s)) of
SOME (c) => c = #"\""
| NONE => false
), stringScan),
((fn _ => true), regexScan)
]
val (strippedStrm : 'strm) = stripStream strm
val scanner = List.find (fn (use, scanner) => use strippedStrm) scanners
in
case scanner of
NONE => raise Fail "No scanner found"
| SOME (scanner) =>
let
val ret = (#2 scanner) strippedStrm
in
ret
end
end
val (sc : (Tokens.token * 'strm) option option) =
SOME (scan' strm)
handle Option.Option => NONE
| Chr => (
reportError "digit too large";
NONE
)
exception EOFException
fun discardChar strm : 'strm =
let
val m = charRdr strm
in
case m of
SOME (c, s) => (
reportError ("bad character '" ^ (Char.toString c) ^ "'");
s
)
| NONE => raise EOFException
end
val scanRec = scan {reportError = reportError} charRdr
val (strippedStrm : 'strm) = stripStream strm
in
(
case sc of
SOME (sc) => (
case sc of
SOME (_) => sc
| NONE => scanRec (discardChar strippedStrm)
)
| NONE => scanRec (discardChar strippedStrm)
) handle EOFException => NONE
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment