meijeru/reds-lex-grammar.r

## reds-lex-grammar.r
REBOL [
	Title:		"Grammar for Red/System lexical analysis"
	Date:		1-Jul-2011
	Name:		"reds-lex-grammar"

	Version:	1.0.0
	File:		%/G/Projects/Common/RED/red-system/sources/reds-lexer/reds-lex-grammar.r
	Home:		http://users.telenet.be/rwmeijer

	Author:		"Rudolf W. Meijer"
	Rights:		"Copyright (C) 2011 Rudolf W. Meijer. All Rights Reserved"

	Content:	{This grammar contains parse rules which will accept
				 lexically valid Red/System expressions, and actions
				 to store the parsed source as a REBOL data structure.
			}

	History:	[
				0.0.0 [19-Jun-2011 {Start of project} "RM"]
				0.5.0 [24-Jun-2011 {First working version} "RM"]
				0.7.0 [27-Jun-2011 {Added file! and tuple! literals} "RM"]
				0.8.0 [27-Jun-2011 {Simplified the separator} "RM"]
				0.8.5 [29-Jun-2011 {Added the % operator} "RM"]
				0.9.0 [30-Jun-2011 {
					Separator reduced to stripping comments only,
					Grammar takes care of whitespace
				} "RM" ]	]
				1.0.0 [1-Jul-2011 {Grammar takes care of comments also} "RM"]

]

lex-grammar: context [

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; data structures
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	source: make block! 1000					; will contain lexed source
	level-stack: make block! 20					; to keep track of nesting
	insert/only level-stack source				; initialize
	current-word: none							; last word parsed
	current-integer: none						; last integer parsed
	current-sel: none							; last path selector parsed
	current-elem: none							; last element parsed

	del: #"^(7F)"

	char-table: reduce [
		"null"		null
		"line"		newline
		"tab"		tab
		"page"		newpage
		"esc"		escape
		"back"		backspace
		"del"		del
	]

	hex-digits:	"0123456789ABCDEF"

	ctrl-chars:	"@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]!_"

	max-neg-int:	to integer! #{80000000}
	max-neg-str:	"-2147483648"
	max-pos-str:	 "2147483647"

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; action handlers
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	handle-elem: does [
		; insert parsed element into source
		if current-elem [
			level-stack/1: insert/only tail level-stack/1 current-elem
			current-elem: none				; avoid multiple handling
		]
	]

	handle-block-start: has [source-ptr][
		; insert new block into source
		insert/only tail level-stack/1 source-ptr: make block! 20
		insert/only level-stack source-ptr
	]

	handle-paren-start: has [source-ptr][
		; insert new paren into source
		insert/only tail level-stack/1 source-ptr: make paren! 20
		insert/only level-stack source-ptr
	]

	store-word: func [s [string!]][
		unless current-word: attempt [to word! s][
			print ["error in word:" s]
		]
	]

	handle-tuple: func [s [string!]][
		unless current-elem: attempt [to tuple! s][
			print ["error in tuple:" s]
		]
	]

	store-integer: func [s [string!] /local res neg pow][
	; must treat largest negative integer separately to avoid math overflow
		if s = max-neg-str [current-integer: max-neg-int exit]
		if neg: #"-" = first s [s: next s]
		if any [
			10 < length? s
			all [10 = length? s max-pos-str < s]
		][
			print ["error in integer:" s]
			current-integer: none exit
		]
		reverse s
		res: 0 pow: 1
		while [not tail? s][
			res: (first s) - #"0" * pow + res
			pow: pow * 10
			s: next s
		]
		current-integer: either neg [negate res][res]
	]

	store-hex-integer: func [s [string!] /local res pos][
		; s is guaranteed to have length 2 4 or 8
		; and to consist of hex digits only
		res: 0 pos: 4 * (length? s) - 8
		while [not tail? s][
			res: res or shift/left decode-hex-byte s pos
			pos: pos - 8
			s: at s 3
		]
		current-integer: res
	]

	decode-hex-byte: func [s [string!]][
		16 * (index? find hex-digits first  s)
		   + (index? find hex-digits second s)
		   - 17		; - 16 - 1 to account for 1-origin
	]

	decode-string: func [
		; handle escaped quantities in character and string literals
		; input is raw string, output is decoded string
		; uses REBOL conventions, notably ^! = 1E (^^ doesn't work)
		; parser has guaranteed legality of escapes
		s [string!]
		/local res c chunk chend
	][
		if 1 >= length? s [return s]
		res: make string! length? s
		while [not tail? s][
			c: first s s: next s
			either c = #"^^"
			[
				c: first s s: next s
				either c = #"("
				[
					chunk: copy/part s chend: find s #")"
					insert tail res either 2 = length? chunk
					[
						to char! decode-hex-byte chunk
					][
						select char-table chunk
					]
					s: next chend
				][
					insert tail res case [
						c = #"/"	[newline]
						c = #"-"	[tab]
						c = #"~"	[del]
						chend: find ctrl-chars c [
							to char! (index? chend) - 1
						]
						true [c]
					]
				]
			][
				insert tail res c
			]
		]
		res
	]

	handle-file: func [s [string!]][
		unless current-elem: attempt [to file! s][
			print ["error in file:" s]
		]
	]

	store-path: has [source-ptr][
		; insert a block to collect head word and path selectors
		insert/only tail level-stack/1 source-ptr: make block! 10
		insert tail source-ptr current-word
		insert/only level-stack source-ptr
	]

	handle-path: has [source-ptr][
		; convert block into path (set-path handled at point of ":")
		source-ptr: head level-stack/1
		remove level-stack
		remove level-stack/1
		current-elem: to path! source-ptr
	]

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; grammar proper - order of rules and alternatives is important
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	program:	[
		any whitesp
		some [
			elem				(handle-elem)
			any whitesp
		]
	]

	elem:		[
	  "["						(handle-block-start)
	  any whitesp
	  any [ elem				(handle-elem)
			any whitesp
	  ]
	  "]"						(remove level-stack)
	| "("						(handle-paren-start)
	  any whitesp
	  any [ elem				(handle-elem)
			any whitesp
	  ]
	  ")"						(remove level-stack)
	| path-expr					; handled directly
	| hex-integer-literal		(current-elem: current-integer)
	| set-word-literal			(current-elem: to set-word! current-word)
	| refinement-literal		(current-elem: to refinement! current-word)
	| tuple-literal				; handled directly
	| file-literal				; handled directly
	| integer-literal			(current-elem: current-integer)
	| regular-word				(current-elem: current-word)
	| special-word				(current-elem: current-word)
	| get-word-literal			(current-elem: to get-word! current-word)
	| byte-literal				; handled directly
	| issue-literal				(current-elem: to issue! current-word)
	| string-literal			; handled directly
	]

	path-expr:			[
	  regular-word "/"			(store-path)
	  sel-expr					(level-stack/1: insert tail level-stack/1 current-sel)
	  any [ "/" sel-expr		(level-stack/1: insert tail level-stack/1 current-sel)
	  ]							(handle-path)
	  opt [":"					(current-elem: to set-path! current-elem)
	  ]
  	]

	sel-expr:			[
	  hex-integer-literal		(current-sel: current-integer)
	| regular-word				(current-sel: current-word)
	| integer-literal			(current-sel: current-integer)
	]

	hex-integer-literal:	[
	  b: [ 8 hexchar | 4 hexchar | 2 hexchar ] e: "h"
	  							(store-hex-integer copy/part b e)
	]

	set-word-literal:	[ regular-word ":" ]
	get-word-literal:	[ ":" regular-word ]
	refinement-literal:	[ "/" regular-word ]
	tuple-literal:		[ b: [ some digit "." some digit some [ "." some digit ]] e:
								(handle-tuple copy/part b e)
	]

	integer-literal:	[ b: opt [ "+" | "-" ] some digit e:
								(store-integer copy/part b e)
	]
	regular-word:		[ b: wstart any wchar e:
								(store-word copy/part b e)
	]
	special-word:		[ b: ["<=" | "<>" | "<" | ">=" | ">" | "+" | "-" | "*" | "//" | "/"] e:
								(store-word copy/part b e )
	  | "%" 					(store-word "///")
	]

	byte-literal:		[ {#"} b: qchar e: {"}
								(current-elem: first decode-string copy/part b e)
	]
	issue-literal:		[ "#" regular-word ]
	string-literal:		[ [ {"} b: any qchar e: {"} | "{" b: any bchar e: "}" ]
								(current-elem: decode-string copy/part b e)
	]
	file-literal:		[ "%" b: some fchar e:
								(handle-file copy/part b e)
	]

	; auxiliary definitions: character (sub-)sets

	whitesp: [ #" " | #"^-" | #"^/" | #"^M" | #";" any non-lf #"^/" ]

	ascii:		charset [ #"^(00)" - #"^(FF)" ]			; full set
	non-lf:		exclude ascii charset "^/"
	inqt:		exclude ascii charset "^"^^^/^M"		; in " ", not escaped
	inbr:		exclude ascii charset "^}^^"			; in { }, not escaped

	alpha:		charset [ #"A" - #"Z" #"a" - #"z" ]
	digit:		charset [ #"0" - #"9" ]
	alfanum:	union alpha digit
	hexchar:	charset hex-digits


	pchar:		exclude ascii charset [ #"^(00)" - #"^(1F)" ]	; printable

	qchar:		[ inqt | echar ]						; in " " and #" "

	bchar:		[ inbr | echar ]						; in { }

	echar:		[										; escaped char
	  "^^(" [ "null" | "line" | "tab" | "page" | "esc" | "back" | "del" ] ")"
	| "^^(" 2 hexchar ")"
	| "^^" pchar
	]

	fchar: union alfanum charset "-/."

	wchar:		union alfanum charset "!&'*+-.=?_`|~"	; in word
	wstart:		exclude wchar union digit charset "'"	; starting word

]
	REBOL [
	Title: "Grammar for Red/System lexical analysis"
	Date: 1-Jul-2011
	Name: "reds-lex-grammar"

	Version: 1.0.0
	File: %/G/Projects/Common/RED/red-system/sources/reds-lexer/reds-lex-grammar.r
	Home: http://users.telenet.be/rwmeijer

	Author: "Rudolf W. Meijer"
	Rights: "Copyright (C) 2011 Rudolf W. Meijer. All Rights Reserved"

	Content: {This grammar contains parse rules which will accept
	lexically valid Red/System expressions, and actions
	to store the parsed source as a REBOL data structure.
	}

	History: [
	0.0.0 [19-Jun-2011 {Start of project} "RM"]
	0.5.0 [24-Jun-2011 {First working version} "RM"]
	0.7.0 [27-Jun-2011 {Added file! and tuple! literals} "RM"]
	0.8.0 [27-Jun-2011 {Simplified the separator} "RM"]
	0.8.5 [29-Jun-2011 {Added the % operator} "RM"]
	0.9.0 [30-Jun-2011 {
	Separator reduced to stripping comments only,
	Grammar takes care of whitespace
	} "RM" ] ]
	1.0.0 [1-Jul-2011 {Grammar takes care of comments also} "RM"]

	]

	lex-grammar: context [

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; data structures
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	source: make block! 1000 ; will contain lexed source
	level-stack: make block! 20 ; to keep track of nesting
	insert/only level-stack source ; initialize
	current-word: none ; last word parsed
	current-integer: none ; last integer parsed
	current-sel: none ; last path selector parsed
	current-elem: none ; last element parsed

	del: #"^(7F)"

	char-table: reduce [
	"null" null
	"line" newline
	"tab" tab
	"page" newpage
	"esc" escape
	"back" backspace
	"del" del
	]

	hex-digits: "0123456789ABCDEF"

	ctrl-chars: "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]!_"

	max-neg-int: to integer! #{80000000}
	max-neg-str: "-2147483648"
	max-pos-str: "2147483647"

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; action handlers
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	handle-elem: does [
	; insert parsed element into source
	if current-elem [
	level-stack/1: insert/only tail level-stack/1 current-elem
	current-elem: none ; avoid multiple handling
	]
	]

	handle-block-start: has [source-ptr][
	; insert new block into source
	insert/only tail level-stack/1 source-ptr: make block! 20
	insert/only level-stack source-ptr
	]

	handle-paren-start: has [source-ptr][
	; insert new paren into source
	insert/only tail level-stack/1 source-ptr: make paren! 20
	insert/only level-stack source-ptr
	]

	store-word: func [s [string!]][
	unless current-word: attempt [to word! s][
	print ["error in word:" s]
	]
	]

	handle-tuple: func [s [string!]][
	unless current-elem: attempt [to tuple! s][
	print ["error in tuple:" s]
	]
	]

	store-integer: func [s [string!] /local res neg pow][
	; must treat largest negative integer separately to avoid math overflow
	if s = max-neg-str [current-integer: max-neg-int exit]
	if neg: #"-" = first s [s: next s]
	if any [
	10 < length? s
	all [10 = length? s max-pos-str < s]
	][
	print ["error in integer:" s]
	current-integer: none exit
	]
	reverse s
	res: 0 pow: 1
	while [not tail? s][
	res: (first s) - #"0" * pow + res
	pow: pow * 10
	s: next s
	]
	current-integer: either neg [negate res][res]
	]

	store-hex-integer: func [s [string!] /local res pos][
	; s is guaranteed to have length 2 4 or 8
	; and to consist of hex digits only
	res: 0 pos: 4 * (length? s) - 8
	while [not tail? s][
	res: res or shift/left decode-hex-byte s pos
	pos: pos - 8
	s: at s 3
	]
	current-integer: res
	]

	decode-hex-byte: func [s [string!]][
	16 * (index? find hex-digits first s)
	+ (index? find hex-digits second s)
	- 17 ; - 16 - 1 to account for 1-origin
	]

	decode-string: func [
	; handle escaped quantities in character and string literals
	; input is raw string, output is decoded string
	; uses REBOL conventions, notably ^! = 1E (^^ doesn't work)
	; parser has guaranteed legality of escapes
	s [string!]
	/local res c chunk chend
	][
	if 1 >= length? s [return s]
	res: make string! length? s
	while [not tail? s][
	c: first s s: next s
	either c = #"^^"
	[
	c: first s s: next s
	either c = #"("
	[
	chunk: copy/part s chend: find s #")"
	insert tail res either 2 = length? chunk
	[
	to char! decode-hex-byte chunk
	][
	select char-table chunk
	]
	s: next chend
	][
	insert tail res case [
	c = #"/" [newline]
	c = #"-" [tab]
	c = #"~" [del]
	chend: find ctrl-chars c [
	to char! (index? chend) - 1
	]
	true [c]
	]
	]
	][
	insert tail res c
	]
	]
	res
	]

	handle-file: func [s [string!]][
	unless current-elem: attempt [to file! s][
	print ["error in file:" s]
	]
	]

	store-path: has [source-ptr][
	; insert a block to collect head word and path selectors
	insert/only tail level-stack/1 source-ptr: make block! 10
	insert tail source-ptr current-word
	insert/only level-stack source-ptr
	]

	handle-path: has [source-ptr][
	; convert block into path (set-path handled at point of ":")
	source-ptr: head level-stack/1
	remove level-stack
	remove level-stack/1
	current-elem: to path! source-ptr
	]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; grammar proper - order of rules and alternatives is important
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	program: [
	any whitesp
	some [
	elem (handle-elem)
	any whitesp
	]
	]

	elem: [
	"[" (handle-block-start)
	any whitesp
	any [ elem (handle-elem)
	any whitesp
	]
	"]" (remove level-stack)
	\| "(" (handle-paren-start)
	any whitesp
	any [ elem (handle-elem)
	any whitesp
	]
	")" (remove level-stack)
	\| path-expr ; handled directly
	\| hex-integer-literal (current-elem: current-integer)
	\| set-word-literal (current-elem: to set-word! current-word)
	\| refinement-literal (current-elem: to refinement! current-word)
	\| tuple-literal ; handled directly
	\| file-literal ; handled directly
	\| integer-literal (current-elem: current-integer)
	\| regular-word (current-elem: current-word)
	\| special-word (current-elem: current-word)
	\| get-word-literal (current-elem: to get-word! current-word)
	\| byte-literal ; handled directly
	\| issue-literal (current-elem: to issue! current-word)
	\| string-literal ; handled directly
	]

	path-expr: [
	regular-word "/" (store-path)
	sel-expr (level-stack/1: insert tail level-stack/1 current-sel)
	any [ "/" sel-expr (level-stack/1: insert tail level-stack/1 current-sel)
	] (handle-path)
	opt [":" (current-elem: to set-path! current-elem)
	]
	]

	sel-expr: [
	hex-integer-literal (current-sel: current-integer)
	\| regular-word (current-sel: current-word)
	\| integer-literal (current-sel: current-integer)
	]

	hex-integer-literal: [
	b: [ 8 hexchar \| 4 hexchar \| 2 hexchar ] e: "h"
	(store-hex-integer copy/part b e)
	]

	set-word-literal: [ regular-word ":" ]
	get-word-literal: [ ":" regular-word ]
	refinement-literal: [ "/" regular-word ]
	tuple-literal: [ b: [ some digit "." some digit some [ "." some digit ]] e:
	(handle-tuple copy/part b e)
	]

	integer-literal: [ b: opt [ "+" \| "-" ] some digit e:
	(store-integer copy/part b e)
	]
	regular-word: [ b: wstart any wchar e:
	(store-word copy/part b e)
	]
	special-word: [ b: ["<=" \| "<>" \| "<" \| ">=" \| ">" \| "+" \| "-" \| "*" \| "//" \| "/"] e:
	(store-word copy/part b e )
	\| "%" (store-word "///")
	]

	byte-literal: [ {#"} b: qchar e: {"}
	(current-elem: first decode-string copy/part b e)
	]
	issue-literal: [ "#" regular-word ]
	string-literal: [ [ {"} b: any qchar e: {"} \| "{" b: any bchar e: "}" ]
	(current-elem: decode-string copy/part b e)
	]
	file-literal: [ "%" b: some fchar e:
	(handle-file copy/part b e)
	]

	; auxiliary definitions: character (sub-)sets

	whitesp: [ #" " \| #"^-" \| #"^/" \| #"^M" \| #";" any non-lf #"^/" ]

	ascii: charset [ #"^(00)" - #"^(FF)" ] ; full set
	non-lf: exclude ascii charset "^/"
	inqt: exclude ascii charset "^"^^^/^M" ; in " ", not escaped
	inbr: exclude ascii charset "^}^^" ; in { }, not escaped

	alpha: charset [ #"A" - #"Z" #"a" - #"z" ]
	digit: charset [ #"0" - #"9" ]
	alfanum: union alpha digit
	hexchar: charset hex-digits


	pchar: exclude ascii charset [ #"^(00)" - #"^(1F)" ] ; printable

	qchar: [ inqt \| echar ] ; in " " and #" "

	bchar: [ inbr \| echar ] ; in { }

	echar: [ ; escaped char
	"^^(" [ "null" \| "line" \| "tab" \| "page" \| "esc" \| "back" \| "del" ] ")"
	\| "^^(" 2 hexchar ")"
	\| "^^" pchar
	]

	fchar: union alfanum charset "-/."

	wchar: union alfanum charset "!&'*+-.=?_`\|~" ; in word
	wstart: exclude wchar union digit charset "'" ; starting word

	]