iArnold/%grep.red

## %grep.red
Red [
	Title: 		"Partial grep implementation"
	Purpose:	{To search the input for lines containing a match
				 to the given pattern, specified as a regular expression}
	Author:		"Rudolf W. MEIJER"
	File:		%grep.red
	Version:	0.4.0
	Date:		"24-Nov-2016"
	Rights:		"(c) Copyright 2016 Rudolf W. MEIJER"
	History:	[
					[0.0.0  "2-Nov-2017" {Start of project}]
					[0.1.0  "9-Nov-2017" {First incomplete working version}]
					[0.2.0 "22-Nov-2016" {Ranges and repetition implemented}]
					[0.3.0 "23-Nov-2016" {Refactoring}]
					[0.4.0 "24-Nov-2017" {Added /quiet option}]
				]
	Notes:		{see GNU Grep 2.26, http://www.gnu.org/software/grep/manual/grep,
				 also the grep(1) Linux man page, e.g. https://linux.die.net/man/1/grep,
				 and https://en.wikipedia.org/wiki/Regular_expression
				}
	Language:	'English
]

;---|----1----|----2----|----3----|----4----|----5----|----6----|----7----|-

grep-impl: context [

	; charset constants
	_ASCII:  charset [#"^(00)" - #"^(7F)"]
	_upper:  charset [#"A" - #"Z"]
	_lower:  charset [#"a" - #"z"]
	_alpha:  union _upper _lower
	_digit:  charset [#"0" - #"9"]
	_xdigit: union _digit charset [#"A" - #"F" #"a" - #"f"]
	_alnum:  union _alpha _digit
	_blank:  charset " ^-"
	_space:  union _blank charset [#"^(0A)" - #"^(0D)"]
	_cntrl:  charset [#"^(00)" - #"^(1F)" #"^(7F)"]
	_punct:  charset {!"#$%&'()*+,-./:;<=>?@[\]^^_`{|}~}
	_graph:  union _alnum _punct
	_print:  union _graph charset " "
	_meta:   charset "|()?*+.[]{}\"
	_escape: charset "|()?*+.[{\"
	_atom:   intersect _ASCII complement _meta

	char-classes: reduce [
		"upper"  _upper
		"lower"  _lower
		"alpha"  _alpha
		"digit"  _digit
		"xdigit" _xdigit
		"alnum"  _alnum
		"blank"  _blank
		"space"  _space
		"cntrl"  _cntrl
		"punct"  _punct
		"graph"  _graph
		"print"  _print
	]

	rules: none

	options: context [
		case: none
		invert: none
		count: none
		quiet: none
	]

	stats: context [
		files: 0
		lines: 0
		matched: 0
		time: 0
	]

	make-rules: func [
		{takes a regex and constructs the corresponding Red parser rules
		 which will do the matching; returns the rules}
		regex [string!] "the pattern (regex)"
		/local res pattern rule-stk rng-nr limits  ll lo hi fch frst rng r qf
		 ch nr cr el lc rst class clb chset escaped compl brexp
	][
		unless parse regex [some _ASCII] [
			print "pattern error, non-ASCII character(s) found"
			return none
		]
		; set up
		pattern: copy regex
		rule-stk: copy []
		rng-nr: 0
		res: copy []
		insert/only rule-stk res
		escaped: false
		; main loop
		while [not tail? pattern][
			cr: first rule-stk	; current rule
			ch: take pattern
			case [
				any [
					escaped
					find _atom ch
					all [#")" = ch 1 = length? rule-stk]
					all [#"\" = ch tail? pattern]
				][
					case [
						any [
							empty? cr
							not string? last cr
						][
							insert tail cr form ch
						]
						string? last cr [
							insert tail last cr ch
						]
					]
					escaped: false
				]
				#"\" = ch [
					unless find _escape first pattern [
						print ["pattern error: meta character expected after \, found" first pattern]
						return none
					]
					escaped: true
				]
				#"." = ch [
					insert tail cr 'skip
				]
				#"(" = ch [
					insert/only tail cr nr: copy []
					insert/only rule-stk nr		; new rule
				]
				#")" = ch [
						remove rule-stk
				]
				#"|" = ch [
					insert tail cr '|
				]
				#"[" = ch [
					fch: none
					frst: true
					compl: false
					rng: false
					brexp: make bitset! 128
					while [not tail? pattern][
						ch: take pattern
						if all [frst #"^^" = ch][compl: true frst: false continue]
						if all [not frst #"]" = ch][ break]
						frst: false
						either all [not rng #"-" = ch]
						[
							rng: true
						][
							either rng
							[
								either ch > fch
								[
									brexp: union brexp charset compose [(fch) - (ch)]
									rng: false
									fch: none
								][
									print ["pattern error: range inverted" fch "-" ch]
									return none
								]
							][
								either #"["= ch
								[
									either #":" = first pattern
									[
										remove pattern
										unless clb: find pattern ":]" [		; closing bracket
											print ["pattern error: expected :], found" pattern]
											return none
										]
										class: take/part pattern clb
										remove/part pattern 2
										unless chset: select char-classes class [
											print ["pattern error: wrong character class" class]
											return none
										]
										brexp: union brexp chset
									][
										print ["pattern error: expected :, found" first pattern]
										return none
									]
								][
									either fch
									[
										brexp: union brexp charset fch
										fch: none
									][
										fch: ch
									]
									brexp: union brexp charset ch
								]
							]
						]
					]
					either #"]" <> ch
					[
						print "pattern error: range not closed"
						return none
					][
						if compl [brexp: complement brexp]
						rng-nr: rng-nr + 1
						r: to word! rejoin ["_range" rng-nr]
						set r brexp
						insert tail cr r
					]
				]
				#"{" = ch [
					unless clb: find pattern #"}" [
						print ["pattern error: expected }, found" pattern]
						return none
					]
					limits: split take/part pattern clb #","
					ll: length? limits
					if any [ll < 1 ll > 2][
						print ["pattern error, expected one or two numbers, found" limits]
						return none
					]
					unless lo: attempt [to integer! limits/1] [
						print ["pattern error: expected number, found" limits/1]
						return none
					]
					unless lo >= 0 [
						print ["pattern error: negative number" lo]
						return none
					]
					if 2 = ll [
						unless hi: attempt [to integer! limits/2] [
							print ["pattern error: expected number, found" limits/2]
							return none
						]
						unless hi > lo [
							print ["pattern error: upper bound lower than lower one" hi]
							return none
						]
					]
					el: last cr
					either string? el
					[
						lc: last el
						rst: head remove back tail el
						either empty? rst
						[
							remove back tail cr
						][
							change back tail cr form rst
						]
						insert tail cr lo
						if ll = 2 [insert tail cr hi]
						insert tail cr form lc
					][
						insert back tail cr lo
						if ll = 2 [insert back tail cr hi]
					]
				]
				any [
					#"?" = ch #"*" = ch #"+" = ch
				][
					qf: switch ch [
						#"?" ['opt]
						#"*" ['any]
						#"+" ['some]
					]
					el: last cr
					either string? el
					[
						lc: last el
						rst: head remove back tail el
						either empty? rst
						[
							remove back tail cr
						][
							change back tail cr form rst
						]
						insert tail cr qf
						insert tail cr form lc
					][
						insert back tail cr qf
					]
				]
			]
;			print "after"
;			print mold res
		]
		; final check for balanced ( )
		unless 1 = length? rule-stk [
			print "pattern error: unmatched parentheses"
			return none
		]
		copy res
	]

	set 'grep func [
		{partial implementation of Unix/Linux grep pattern matching engine
documentation consulted: GNU Grep 2.26 and grep(1) - Linux man page}
		pattern [string!] "the pattern (extended regular expression) to match"
		input [string! file! url! block!] {a single string, possibly containing
newlines, a file, a url or a block of strings, or files and/or urls}
		/case		"case-sensitive comparison"
		/invert		"print only non-matching lines"
		/count		"print a count of (non-)matching lines only"
		/quiet		"suppress printing when no lines found"
		/local fst tm
	][
		if empty? input [
			print "empty input"
			exit
		]
		if empty? pattern [
			print "error, no pattern specified"
			exit
		]
		unless rules: make-rules pattern [
			print "error in rules"
			exit
		]
		rules: head insert/only next copy [thru to end] rules
		options/case: case
		options/invert: invert
		options/count: count
		options/quiet: quiet
		stats/lines: 0
		stats/files: 0
		stats/matched: 0
		stats/time: now/time/precise
		switch type?/word input [
			string! [
				report split input #"^/" "matching text lines"
			]
			file! [
				either #"/" = first input
				[
					process reduce [input] %""
				][
					process reduce [input] what-dir
				]
			]
			url! [
				process reduce [input] %""
			]
			block! [
				fst: first input
				unless any [string? fst file? fst url? fst][
					print ["error, string, file or url expected:" fst]
					exit
				]
				either string? fst
				[
					report input "matching text lines"
				][
					process input what-dir
				]
			]
		]
		tm: to integer! now/time/precise - stats/time * 1000
		prin ["done, checked" stats/lines "lines in "]
		unless stats/files = 0 [
			prin [stats/files "files/urls in "]
		]
		print [tm "msec, found" stats/matched "matches"]
	]

	process: func [
		{process a block of files/urls;
if some of the files are directories,
process the files in them recursively}
		input [block!]	"block of files/urls"
		pdir [file!]	"parent directory for files"
		/local lines src
	][
		foreach src input [
			if file? src [src: append copy pdir src]
			case [
				any [
					url? src
					all [file? src not dir? src]
				][
					either lines: attempt [read/lines src]
					[
						report lines ["matching" fmt length? lines 5 "lines from" mold src]
					][
						unless options/quiet [
							print [mold src "cannot be read, skipped"]
						]
					]
				]
				file? src [ ; this is a dir!!!
					process read src src
				]
				true [
					print ["error, file or url expected, found:" type? src]
				]
			]
		]
	]

	report: func [
		{match and report according to options}
		lines [block!]			"block of lines to match"
		text [string! block!]	"heading for each string/file/url"
		/local buffer line res line-count i
	][
		if any [not lines empty? lines ][exit]
		if block? text [stats/files: stats/files + 1]
		stats/lines: stats/lines + length? lines
		buffer: copy []
		line-count: 0
		repeat i length? lines [
			line: lines/:i
			unless string? line [
				print ["error, string expected, found:" type? line]
				exit
			]
			res: either options/case [parse/case line rules][parse line rules]
			if options/invert [res: not res]
			if res [
				line-count: line-count + 1
				insert tail buffer rejoin [fmt i 5 " " line]
			]
		]
		stats/matched: stats/matched + line-count
		if any [
			line-count <> 0
			not options/quiet
		][
			print text
			either options/count
			[
				print [fmt line-count 5 "matching line(s) found"]
			][
				repeat i length? buffer [print buffer/:i]
			]
		]
	]

	fmt: func [
		{convert an integer to a right aligned string of given size;
pad with blanks on left; if too narrow, show ***}
		i [integer!]	"the integer to convert"
		s [integer!]	"the size in characters"
		/local f lf
	][
		s: max 1 s
		f: form i
		lf: length? f
		either lf <= s
		[
			f: head insert/dup f #" " s - lf
		][
			f: head insert/dup copy "" #"*" s
		]
	]
]
	Red [
	Title: "Partial grep implementation"
	Purpose: {To search the input for lines containing a match
	to the given pattern, specified as a regular expression}
	Author: "Rudolf W. MEIJER"
	File: %grep.red
	Version: 0.4.0
	Date: "24-Nov-2016"
	Rights: "(c) Copyright 2016 Rudolf W. MEIJER"
	History: [
	[0.0.0 "2-Nov-2017" {Start of project}]
	[0.1.0 "9-Nov-2017" {First incomplete working version}]
	[0.2.0 "22-Nov-2016" {Ranges and repetition implemented}]
	[0.3.0 "23-Nov-2016" {Refactoring}]
	[0.4.0 "24-Nov-2017" {Added /quiet option}]
	]
	Notes: {see GNU Grep 2.26, http://www.gnu.org/software/grep/manual/grep,
	also the grep(1) Linux man page, e.g. https://linux.die.net/man/1/grep,
	and https://en.wikipedia.org/wiki/Regular_expression
	}
	Language: 'English
	]

	;---\|----1----\|----2----\|----3----\|----4----\|----5----\|----6----\|----7----\|-

	grep-impl: context [

	; charset constants
	_ASCII: charset [#"^(00)" - #"^(7F)"]
	_upper: charset [#"A" - #"Z"]
	_lower: charset [#"a" - #"z"]
	_alpha: union _upper _lower
	_digit: charset [#"0" - #"9"]
	_xdigit: union _digit charset [#"A" - #"F" #"a" - #"f"]
	_alnum: union _alpha _digit
	_blank: charset " ^-"
	_space: union _blank charset [#"^(0A)" - #"^(0D)"]
	_cntrl: charset [#"^(00)" - #"^(1F)" #"^(7F)"]
	_punct: charset {!"#$%&'()*+,-./:;<=>?@[\]^^_`{\|}~}
	_graph: union _alnum _punct
	_print: union _graph charset " "
	_meta: charset "\|()?*+.[]{}\"
	_escape: charset "\|()?*+.[{\"
	_atom: intersect _ASCII complement _meta

	char-classes: reduce [
	"upper" _upper
	"lower" _lower
	"alpha" _alpha
	"digit" _digit
	"xdigit" _xdigit
	"alnum" _alnum
	"blank" _blank
	"space" _space
	"cntrl" _cntrl
	"punct" _punct
	"graph" _graph
	"print" _print
	]

	rules: none

	options: context [
	case: none
	invert: none
	count: none
	quiet: none
	]

	stats: context [
	files: 0
	lines: 0
	matched: 0
	time: 0
	]

	make-rules: func [
	{takes a regex and constructs the corresponding Red parser rules
	which will do the matching; returns the rules}
	regex [string!] "the pattern (regex)"
	/local res pattern rule-stk rng-nr limits ll lo hi fch frst rng r qf
	ch nr cr el lc rst class clb chset escaped compl brexp
	][
	unless parse regex [some _ASCII] [
	print "pattern error, non-ASCII character(s) found"
	return none
	]
	; set up
	pattern: copy regex
	rule-stk: copy []
	rng-nr: 0
	res: copy []
	insert/only rule-stk res
	escaped: false
	; main loop
	while [not tail? pattern][
	cr: first rule-stk ; current rule
	ch: take pattern
	case [
	any [
	escaped
	find _atom ch
	all [#")" = ch 1 = length? rule-stk]
	all [#"\" = ch tail? pattern]
	][
	case [
	any [
	empty? cr
	not string? last cr
	][
	insert tail cr form ch
	]
	string? last cr [
	insert tail last cr ch
	]
	]
	escaped: false
	]
	#"\" = ch [
	unless find _escape first pattern [
	print ["pattern error: meta character expected after \, found" first pattern]
	return none
	]
	escaped: true
	]
	#"." = ch [
	insert tail cr 'skip
	]
	#"(" = ch [
	insert/only tail cr nr: copy []
	insert/only rule-stk nr ; new rule
	]
	#")" = ch [
	remove rule-stk
	]
	#"\|" = ch [
	insert tail cr '\|
	]
	#"[" = ch [
	fch: none
	frst: true
	compl: false
	rng: false
	brexp: make bitset! 128
	while [not tail? pattern][
	ch: take pattern
	if all [frst #"^^" = ch][compl: true frst: false continue]
	if all [not frst #"]" = ch][ break]
	frst: false
	either all [not rng #"-" = ch]
	[
	rng: true
	][
	either rng
	[
	either ch > fch
	[
	brexp: union brexp charset compose [(fch) - (ch)]
	rng: false
	fch: none
	][
	print ["pattern error: range inverted" fch "-" ch]
	return none
	]
	][
	either #"["= ch
	[
	either #":" = first pattern
	[
	remove pattern
	unless clb: find pattern ":]" [ ; closing bracket
	print ["pattern error: expected :], found" pattern]
	return none
	]
	class: take/part pattern clb
	remove/part pattern 2
	unless chset: select char-classes class [
	print ["pattern error: wrong character class" class]
	return none
	]
	brexp: union brexp chset
	][
	print ["pattern error: expected :, found" first pattern]
	return none
	]
	][
	either fch
	[
	brexp: union brexp charset fch
	fch: none
	][
	fch: ch
	]
	brexp: union brexp charset ch
	]
	]
	]
	]
	either #"]" <> ch
	[
	print "pattern error: range not closed"
	return none
	][
	if compl [brexp: complement brexp]
	rng-nr: rng-nr + 1
	r: to word! rejoin ["_range" rng-nr]
	set r brexp
	insert tail cr r
	]
	]
	#"{" = ch [
	unless clb: find pattern #"}" [
	print ["pattern error: expected }, found" pattern]
	return none
	]
	limits: split take/part pattern clb #","
	ll: length? limits
	if any [ll < 1 ll > 2][
	print ["pattern error, expected one or two numbers, found" limits]
	return none
	]
	unless lo: attempt [to integer! limits/1] [
	print ["pattern error: expected number, found" limits/1]
	return none
	]
	unless lo >= 0 [
	print ["pattern error: negative number" lo]
	return none
	]
	if 2 = ll [
	unless hi: attempt [to integer! limits/2] [
	print ["pattern error: expected number, found" limits/2]
	return none
	]
	unless hi > lo [
	print ["pattern error: upper bound lower than lower one" hi]
	return none
	]
	]
	el: last cr
	either string? el
	[
	lc: last el
	rst: head remove back tail el
	either empty? rst
	[
	remove back tail cr
	][
	change back tail cr form rst
	]
	insert tail cr lo
	if ll = 2 [insert tail cr hi]
	insert tail cr form lc
	][
	insert back tail cr lo
	if ll = 2 [insert back tail cr hi]
	]
	]
	any [
	#"?" = ch #"*" = ch #"+" = ch
	][
	qf: switch ch [
	#"?" ['opt]
	#"*" ['any]
	#"+" ['some]
	]
	el: last cr
	either string? el
	[
	lc: last el
	rst: head remove back tail el
	either empty? rst
	[
	remove back tail cr
	][
	change back tail cr form rst
	]
	insert tail cr qf
	insert tail cr form lc
	][
	insert back tail cr qf
	]
	]
	]
	; print "after"
	; print mold res
	]
	; final check for balanced ( )
	unless 1 = length? rule-stk [
	print "pattern error: unmatched parentheses"
	return none
	]
	copy res
	]

	set 'grep func [
	{partial implementation of Unix/Linux grep pattern matching engine
	documentation consulted: GNU Grep 2.26 and grep(1) - Linux man page}
	pattern [string!] "the pattern (extended regular expression) to match"
	input [string! file! url! block!] {a single string, possibly containing
	newlines, a file, a url or a block of strings, or files and/or urls}
	/case "case-sensitive comparison"
	/invert "print only non-matching lines"
	/count "print a count of (non-)matching lines only"
	/quiet "suppress printing when no lines found"
	/local fst tm
	][
	if empty? input [
	print "empty input"
	exit
	]
	if empty? pattern [
	print "error, no pattern specified"
	exit
	]
	unless rules: make-rules pattern [
	print "error in rules"
	exit
	]
	rules: head insert/only next copy [thru to end] rules
	options/case: case
	options/invert: invert
	options/count: count
	options/quiet: quiet
	stats/lines: 0
	stats/files: 0
	stats/matched: 0
	stats/time: now/time/precise
	switch type?/word input [
	string! [
	report split input #"^/" "matching text lines"
	]
	file! [
	either #"/" = first input
	[
	process reduce [input] %""
	][
	process reduce [input] what-dir
	]
	]
	url! [
	process reduce [input] %""
	]
	block! [
	fst: first input
	unless any [string? fst file? fst url? fst][
	print ["error, string, file or url expected:" fst]
	exit
	]
	either string? fst
	[
	report input "matching text lines"
	][
	process input what-dir
	]
	]
	]
	tm: to integer! now/time/precise - stats/time * 1000
	prin ["done, checked" stats/lines "lines in "]
	unless stats/files = 0 [
	prin [stats/files "files/urls in "]
	]
	print [tm "msec, found" stats/matched "matches"]
	]

	process: func [
	{process a block of files/urls;
	if some of the files are directories,
	process the files in them recursively}
	input [block!] "block of files/urls"
	pdir [file!] "parent directory for files"
	/local lines src
	][
	foreach src input [
	if file? src [src: append copy pdir src]
	case [
	any [
	url? src
	all [file? src not dir? src]
	][
	either lines: attempt [read/lines src]
	[
	report lines ["matching" fmt length? lines 5 "lines from" mold src]
	][
	unless options/quiet [
	print [mold src "cannot be read, skipped"]
	]
	]
	]
	file? src [ ; this is a dir!!!
	process read src src
	]
	true [
	print ["error, file or url expected, found:" type? src]
	]
	]
	]
	]

	report: func [
	{match and report according to options}
	lines [block!] "block of lines to match"
	text [string! block!] "heading for each string/file/url"
	/local buffer line res line-count i
	][
	if any [not lines empty? lines ][exit]
	if block? text [stats/files: stats/files + 1]
	stats/lines: stats/lines + length? lines
	buffer: copy []
	line-count: 0
	repeat i length? lines [
	line: lines/:i
	unless string? line [
	print ["error, string expected, found:" type? line]
	exit
	]
	res: either options/case [parse/case line rules][parse line rules]
	if options/invert [res: not res]
	if res [
	line-count: line-count + 1
	insert tail buffer rejoin [fmt i 5 " " line]
	]
	]
	stats/matched: stats/matched + line-count
	if any [
	line-count <> 0
	not options/quiet
	][
	print text
	either options/count
	[
	print [fmt line-count 5 "matching line(s) found"]
	][
	repeat i length? buffer [print buffer/:i]
	]
	]
	]

	fmt: func [
	{convert an integer to a right aligned string of given size;
	pad with blanks on left; if too narrow, show ***}
	i [integer!] "the integer to convert"
	s [integer!] "the size in characters"
	/local f lf
	][
	s: max 1 s
	f: form i
	lf: length? f
	either lf <= s
	[
	f: head insert/dup f #" " s - lf
	][
	f: head insert/dup copy "" #"*" s
	]
	]
	]