Created
June 25, 2011 10:18
-
-
Save meijeru/1046343 to your computer and use it in GitHub Desktop.
Reds lexical grammar (is called by Reds lexer)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
REBOL [ | |
Title: "Grammar for Red/System lexical analysis" | |
Date: 1-Jul-2011 | |
Name: "reds-lex-grammar" | |
Version: 1.0.0 | |
File: %/G/Projects/Common/RED/red-system/sources/reds-lexer/reds-lex-grammar.r | |
Home: http://users.telenet.be/rwmeijer | |
Author: "Rudolf W. Meijer" | |
Rights: "Copyright (C) 2011 Rudolf W. Meijer. All Rights Reserved" | |
Content: {This grammar contains parse rules which will accept | |
lexically valid Red/System expressions, and actions | |
to store the parsed source as a REBOL data structure. | |
} | |
History: [ | |
0.0.0 [19-Jun-2011 {Start of project} "RM"] | |
0.5.0 [24-Jun-2011 {First working version} "RM"] | |
0.7.0 [27-Jun-2011 {Added file! and tuple! literals} "RM"] | |
0.8.0 [27-Jun-2011 {Simplified the separator} "RM"] | |
0.8.5 [29-Jun-2011 {Added the % operator} "RM"] | |
0.9.0 [30-Jun-2011 { | |
Separator reduced to stripping comments only, | |
Grammar takes care of whitespace | |
} "RM" ] ] | |
1.0.0 [1-Jul-2011 {Grammar takes care of comments also} "RM"] | |
] | |
lex-grammar: context [ | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
; data structures | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
source: make block! 1000 ; will contain lexed source | |
level-stack: make block! 20 ; to keep track of nesting | |
insert/only level-stack source ; initialize | |
current-word: none ; last word parsed | |
current-integer: none ; last integer parsed | |
current-sel: none ; last path selector parsed | |
current-elem: none ; last element parsed | |
del: #"^(7F)" | |
char-table: reduce [ | |
"null" null | |
"line" newline | |
"tab" tab | |
"page" newpage | |
"esc" escape | |
"back" backspace | |
"del" del | |
] | |
hex-digits: "0123456789ABCDEF" | |
ctrl-chars: "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]!_" | |
max-neg-int: to integer! #{80000000} | |
max-neg-str: "-2147483648" | |
max-pos-str: "2147483647" | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
; action handlers | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
handle-elem: does [ | |
; insert parsed element into source | |
if current-elem [ | |
level-stack/1: insert/only tail level-stack/1 current-elem | |
current-elem: none ; avoid multiple handling | |
] | |
] | |
handle-block-start: has [source-ptr][ | |
; insert new block into source | |
insert/only tail level-stack/1 source-ptr: make block! 20 | |
insert/only level-stack source-ptr | |
] | |
handle-paren-start: has [source-ptr][ | |
; insert new paren into source | |
insert/only tail level-stack/1 source-ptr: make paren! 20 | |
insert/only level-stack source-ptr | |
] | |
store-word: func [s [string!]][ | |
unless current-word: attempt [to word! s][ | |
print ["error in word:" s] | |
] | |
] | |
handle-tuple: func [s [string!]][ | |
unless current-elem: attempt [to tuple! s][ | |
print ["error in tuple:" s] | |
] | |
] | |
store-integer: func [s [string!] /local res neg pow][ | |
; must treat largest negative integer separately to avoid math overflow | |
if s = max-neg-str [current-integer: max-neg-int exit] | |
if neg: #"-" = first s [s: next s] | |
if any [ | |
10 < length? s | |
all [10 = length? s max-pos-str < s] | |
][ | |
print ["error in integer:" s] | |
current-integer: none exit | |
] | |
reverse s | |
res: 0 pow: 1 | |
while [not tail? s][ | |
res: (first s) - #"0" * pow + res | |
pow: pow * 10 | |
s: next s | |
] | |
current-integer: either neg [negate res][res] | |
] | |
store-hex-integer: func [s [string!] /local res pos][ | |
; s is guaranteed to have length 2 4 or 8 | |
; and to consist of hex digits only | |
res: 0 pos: 4 * (length? s) - 8 | |
while [not tail? s][ | |
res: res or shift/left decode-hex-byte s pos | |
pos: pos - 8 | |
s: at s 3 | |
] | |
current-integer: res | |
] | |
decode-hex-byte: func [s [string!]][ | |
16 * (index? find hex-digits first s) | |
+ (index? find hex-digits second s) | |
- 17 ; - 16 - 1 to account for 1-origin | |
] | |
decode-string: func [ | |
; handle escaped quantities in character and string literals | |
; input is raw string, output is decoded string | |
; uses REBOL conventions, notably ^! = 1E (^^ doesn't work) | |
; parser has guaranteed legality of escapes | |
s [string!] | |
/local res c chunk chend | |
][ | |
if 1 >= length? s [return s] | |
res: make string! length? s | |
while [not tail? s][ | |
c: first s s: next s | |
either c = #"^^" | |
[ | |
c: first s s: next s | |
either c = #"(" | |
[ | |
chunk: copy/part s chend: find s #")" | |
insert tail res either 2 = length? chunk | |
[ | |
to char! decode-hex-byte chunk | |
][ | |
select char-table chunk | |
] | |
s: next chend | |
][ | |
insert tail res case [ | |
c = #"/" [newline] | |
c = #"-" [tab] | |
c = #"~" [del] | |
chend: find ctrl-chars c [ | |
to char! (index? chend) - 1 | |
] | |
true [c] | |
] | |
] | |
][ | |
insert tail res c | |
] | |
] | |
res | |
] | |
handle-file: func [s [string!]][ | |
unless current-elem: attempt [to file! s][ | |
print ["error in file:" s] | |
] | |
] | |
store-path: has [source-ptr][ | |
; insert a block to collect head word and path selectors | |
insert/only tail level-stack/1 source-ptr: make block! 10 | |
insert tail source-ptr current-word | |
insert/only level-stack source-ptr | |
] | |
handle-path: has [source-ptr][ | |
; convert block into path (set-path handled at point of ":") | |
source-ptr: head level-stack/1 | |
remove level-stack | |
remove level-stack/1 | |
current-elem: to path! source-ptr | |
] | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
; grammar proper - order of rules and alternatives is important | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
program: [ | |
any whitesp | |
some [ | |
elem (handle-elem) | |
any whitesp | |
] | |
] | |
elem: [ | |
"[" (handle-block-start) | |
any whitesp | |
any [ elem (handle-elem) | |
any whitesp | |
] | |
"]" (remove level-stack) | |
| "(" (handle-paren-start) | |
any whitesp | |
any [ elem (handle-elem) | |
any whitesp | |
] | |
")" (remove level-stack) | |
| path-expr ; handled directly | |
| hex-integer-literal (current-elem: current-integer) | |
| set-word-literal (current-elem: to set-word! current-word) | |
| refinement-literal (current-elem: to refinement! current-word) | |
| tuple-literal ; handled directly | |
| file-literal ; handled directly | |
| integer-literal (current-elem: current-integer) | |
| regular-word (current-elem: current-word) | |
| special-word (current-elem: current-word) | |
| get-word-literal (current-elem: to get-word! current-word) | |
| byte-literal ; handled directly | |
| issue-literal (current-elem: to issue! current-word) | |
| string-literal ; handled directly | |
] | |
path-expr: [ | |
regular-word "/" (store-path) | |
sel-expr (level-stack/1: insert tail level-stack/1 current-sel) | |
any [ "/" sel-expr (level-stack/1: insert tail level-stack/1 current-sel) | |
] (handle-path) | |
opt [":" (current-elem: to set-path! current-elem) | |
] | |
] | |
sel-expr: [ | |
hex-integer-literal (current-sel: current-integer) | |
| regular-word (current-sel: current-word) | |
| integer-literal (current-sel: current-integer) | |
] | |
hex-integer-literal: [ | |
b: [ 8 hexchar | 4 hexchar | 2 hexchar ] e: "h" | |
(store-hex-integer copy/part b e) | |
] | |
set-word-literal: [ regular-word ":" ] | |
get-word-literal: [ ":" regular-word ] | |
refinement-literal: [ "/" regular-word ] | |
tuple-literal: [ b: [ some digit "." some digit some [ "." some digit ]] e: | |
(handle-tuple copy/part b e) | |
] | |
integer-literal: [ b: opt [ "+" | "-" ] some digit e: | |
(store-integer copy/part b e) | |
] | |
regular-word: [ b: wstart any wchar e: | |
(store-word copy/part b e) | |
] | |
special-word: [ b: ["<=" | "<>" | "<" | ">=" | ">" | "+" | "-" | "*" | "//" | "/"] e: | |
(store-word copy/part b e ) | |
| "%" (store-word "///") | |
] | |
byte-literal: [ {#"} b: qchar e: {"} | |
(current-elem: first decode-string copy/part b e) | |
] | |
issue-literal: [ "#" regular-word ] | |
string-literal: [ [ {"} b: any qchar e: {"} | "{" b: any bchar e: "}" ] | |
(current-elem: decode-string copy/part b e) | |
] | |
file-literal: [ "%" b: some fchar e: | |
(handle-file copy/part b e) | |
] | |
; auxiliary definitions: character (sub-)sets | |
whitesp: [ #" " | #"^-" | #"^/" | #"^M" | #";" any non-lf #"^/" ] | |
ascii: charset [ #"^(00)" - #"^(FF)" ] ; full set | |
non-lf: exclude ascii charset "^/" | |
inqt: exclude ascii charset "^"^^^/^M" ; in " ", not escaped | |
inbr: exclude ascii charset "^}^^" ; in { }, not escaped | |
alpha: charset [ #"A" - #"Z" #"a" - #"z" ] | |
digit: charset [ #"0" - #"9" ] | |
alfanum: union alpha digit | |
hexchar: charset hex-digits | |
pchar: exclude ascii charset [ #"^(00)" - #"^(1F)" ] ; printable | |
qchar: [ inqt | echar ] ; in " " and #" " | |
bchar: [ inbr | echar ] ; in { } | |
echar: [ ; escaped char | |
"^^(" [ "null" | "line" | "tab" | "page" | "esc" | "back" | "del" ] ")" | |
| "^^(" 2 hexchar ")" | |
| "^^" pchar | |
] | |
fchar: union alfanum charset "-/." | |
wchar: union alfanum charset "!&'*+-.=?_`|~" ; in word | |
wstart: exclude wchar union digit charset "'" ; starting word | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment