Skip to content

Instantly share code, notes, and snippets.

@meijeru
Created June 25, 2011 10:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save meijeru/1046343 to your computer and use it in GitHub Desktop.
Save meijeru/1046343 to your computer and use it in GitHub Desktop.
Reds lexical grammar (is called by Reds lexer)
REBOL [
Title: "Grammar for Red/System lexical analysis"
Date: 1-Jul-2011
Name: "reds-lex-grammar"
Version: 1.0.0
File: %/G/Projects/Common/RED/red-system/sources/reds-lexer/reds-lex-grammar.r
Home: http://users.telenet.be/rwmeijer
Author: "Rudolf W. Meijer"
Rights: "Copyright (C) 2011 Rudolf W. Meijer. All Rights Reserved"
Content: {This grammar contains parse rules which will accept
lexically valid Red/System expressions, and actions
to store the parsed source as a REBOL data structure.
}
History: [
0.0.0 [19-Jun-2011 {Start of project} "RM"]
0.5.0 [24-Jun-2011 {First working version} "RM"]
0.7.0 [27-Jun-2011 {Added file! and tuple! literals} "RM"]
0.8.0 [27-Jun-2011 {Simplified the separator} "RM"]
0.8.5 [29-Jun-2011 {Added the % operator} "RM"]
0.9.0 [30-Jun-2011 {
Separator reduced to stripping comments only,
Grammar takes care of whitespace
} "RM" ] ]
1.0.0 [1-Jul-2011 {Grammar takes care of comments also} "RM"]
]
lex-grammar: context [
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; data structures
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
source: make block! 1000 ; will contain lexed source
level-stack: make block! 20 ; to keep track of nesting
insert/only level-stack source ; initialize
current-word: none ; last word parsed
current-integer: none ; last integer parsed
current-sel: none ; last path selector parsed
current-elem: none ; last element parsed
del: #"^(7F)"
char-table: reduce [
"null" null
"line" newline
"tab" tab
"page" newpage
"esc" escape
"back" backspace
"del" del
]
hex-digits: "0123456789ABCDEF"
ctrl-chars: "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]!_"
max-neg-int: to integer! #{80000000}
max-neg-str: "-2147483648"
max-pos-str: "2147483647"
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; action handlers
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
handle-elem: does [
; insert parsed element into source
if current-elem [
level-stack/1: insert/only tail level-stack/1 current-elem
current-elem: none ; avoid multiple handling
]
]
handle-block-start: has [source-ptr][
; insert new block into source
insert/only tail level-stack/1 source-ptr: make block! 20
insert/only level-stack source-ptr
]
handle-paren-start: has [source-ptr][
; insert new paren into source
insert/only tail level-stack/1 source-ptr: make paren! 20
insert/only level-stack source-ptr
]
store-word: func [s [string!]][
unless current-word: attempt [to word! s][
print ["error in word:" s]
]
]
handle-tuple: func [s [string!]][
unless current-elem: attempt [to tuple! s][
print ["error in tuple:" s]
]
]
store-integer: func [s [string!] /local res neg pow][
; must treat largest negative integer separately to avoid math overflow
if s = max-neg-str [current-integer: max-neg-int exit]
if neg: #"-" = first s [s: next s]
if any [
10 < length? s
all [10 = length? s max-pos-str < s]
][
print ["error in integer:" s]
current-integer: none exit
]
reverse s
res: 0 pow: 1
while [not tail? s][
res: (first s) - #"0" * pow + res
pow: pow * 10
s: next s
]
current-integer: either neg [negate res][res]
]
store-hex-integer: func [s [string!] /local res pos][
; s is guaranteed to have length 2 4 or 8
; and to consist of hex digits only
res: 0 pos: 4 * (length? s) - 8
while [not tail? s][
res: res or shift/left decode-hex-byte s pos
pos: pos - 8
s: at s 3
]
current-integer: res
]
decode-hex-byte: func [s [string!]][
16 * (index? find hex-digits first s)
+ (index? find hex-digits second s)
- 17 ; - 16 - 1 to account for 1-origin
]
decode-string: func [
; handle escaped quantities in character and string literals
; input is raw string, output is decoded string
; uses REBOL conventions, notably ^! = 1E (^^ doesn't work)
; parser has guaranteed legality of escapes
s [string!]
/local res c chunk chend
][
if 1 >= length? s [return s]
res: make string! length? s
while [not tail? s][
c: first s s: next s
either c = #"^^"
[
c: first s s: next s
either c = #"("
[
chunk: copy/part s chend: find s #")"
insert tail res either 2 = length? chunk
[
to char! decode-hex-byte chunk
][
select char-table chunk
]
s: next chend
][
insert tail res case [
c = #"/" [newline]
c = #"-" [tab]
c = #"~" [del]
chend: find ctrl-chars c [
to char! (index? chend) - 1
]
true [c]
]
]
][
insert tail res c
]
]
res
]
handle-file: func [s [string!]][
unless current-elem: attempt [to file! s][
print ["error in file:" s]
]
]
store-path: has [source-ptr][
; insert a block to collect head word and path selectors
insert/only tail level-stack/1 source-ptr: make block! 10
insert tail source-ptr current-word
insert/only level-stack source-ptr
]
handle-path: has [source-ptr][
; convert block into path (set-path handled at point of ":")
source-ptr: head level-stack/1
remove level-stack
remove level-stack/1
current-elem: to path! source-ptr
]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; grammar proper - order of rules and alternatives is important
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
program: [
any whitesp
some [
elem (handle-elem)
any whitesp
]
]
elem: [
"[" (handle-block-start)
any whitesp
any [ elem (handle-elem)
any whitesp
]
"]" (remove level-stack)
| "(" (handle-paren-start)
any whitesp
any [ elem (handle-elem)
any whitesp
]
")" (remove level-stack)
| path-expr ; handled directly
| hex-integer-literal (current-elem: current-integer)
| set-word-literal (current-elem: to set-word! current-word)
| refinement-literal (current-elem: to refinement! current-word)
| tuple-literal ; handled directly
| file-literal ; handled directly
| integer-literal (current-elem: current-integer)
| regular-word (current-elem: current-word)
| special-word (current-elem: current-word)
| get-word-literal (current-elem: to get-word! current-word)
| byte-literal ; handled directly
| issue-literal (current-elem: to issue! current-word)
| string-literal ; handled directly
]
path-expr: [
regular-word "/" (store-path)
sel-expr (level-stack/1: insert tail level-stack/1 current-sel)
any [ "/" sel-expr (level-stack/1: insert tail level-stack/1 current-sel)
] (handle-path)
opt [":" (current-elem: to set-path! current-elem)
]
]
sel-expr: [
hex-integer-literal (current-sel: current-integer)
| regular-word (current-sel: current-word)
| integer-literal (current-sel: current-integer)
]
hex-integer-literal: [
b: [ 8 hexchar | 4 hexchar | 2 hexchar ] e: "h"
(store-hex-integer copy/part b e)
]
set-word-literal: [ regular-word ":" ]
get-word-literal: [ ":" regular-word ]
refinement-literal: [ "/" regular-word ]
tuple-literal: [ b: [ some digit "." some digit some [ "." some digit ]] e:
(handle-tuple copy/part b e)
]
integer-literal: [ b: opt [ "+" | "-" ] some digit e:
(store-integer copy/part b e)
]
regular-word: [ b: wstart any wchar e:
(store-word copy/part b e)
]
special-word: [ b: ["<=" | "<>" | "<" | ">=" | ">" | "+" | "-" | "*" | "//" | "/"] e:
(store-word copy/part b e )
| "%" (store-word "///")
]
byte-literal: [ {#"} b: qchar e: {"}
(current-elem: first decode-string copy/part b e)
]
issue-literal: [ "#" regular-word ]
string-literal: [ [ {"} b: any qchar e: {"} | "{" b: any bchar e: "}" ]
(current-elem: decode-string copy/part b e)
]
file-literal: [ "%" b: some fchar e:
(handle-file copy/part b e)
]
; auxiliary definitions: character (sub-)sets
whitesp: [ #" " | #"^-" | #"^/" | #"^M" | #";" any non-lf #"^/" ]
ascii: charset [ #"^(00)" - #"^(FF)" ] ; full set
non-lf: exclude ascii charset "^/"
inqt: exclude ascii charset "^"^^^/^M" ; in " ", not escaped
inbr: exclude ascii charset "^}^^" ; in { }, not escaped
alpha: charset [ #"A" - #"Z" #"a" - #"z" ]
digit: charset [ #"0" - #"9" ]
alfanum: union alpha digit
hexchar: charset hex-digits
pchar: exclude ascii charset [ #"^(00)" - #"^(1F)" ] ; printable
qchar: [ inqt | echar ] ; in " " and #" "
bchar: [ inbr | echar ] ; in { }
echar: [ ; escaped char
"^^(" [ "null" | "line" | "tab" | "page" | "esc" | "back" | "del" ] ")"
| "^^(" 2 hexchar ")"
| "^^" pchar
]
fchar: union alfanum charset "-/."
wchar: union alfanum charset "!&'*+-.=?_`|~" ; in word
wstart: exclude wchar union digit charset "'" ; starting word
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment