laynor/combinators.dyalog

## combinators.dyalog
 :Namespace p
   ⍝ Parsers take a stream of token and return three values:
   ⍝ success word rest ← parser 'my sentence'
   ⍝ success: 1 (success) or 0 (failure)

   ⍝ Coding convention: parsing result
   ⍝ s r R ← parser input
   ⍝ s: status, can be either Ok (1) or Fail (0)
   ⍝ r: result, any value in case of success, ⍬ in case of failure
   ⍝ R: rest, the remaining input to be parsed.

   ⍝ Documentation conventions:
   ⍝ P : a parser, that is, a function that takes a vector
   ⍝     of tokens and returns the triple (s r R)

   Ok←1
   Fail←0

   ⍝ pred terminal : scans and returns one token T if (pred T), fails otherwise
   terminal←{                   ⍝ ⍵ is an array of tokens, ⍺⍺ a predicate
     ⍺⍺ 1↑⍵: Ok (1↑⍵) (1↓⍵)     ⍝ if (⍺⍺ ⍵[1]) [Ok w[1] ⍵[1..]]
             Fail ⍬ ⍵                   ⍝ else fail empty ⍵
   }

   ⍝ implemented as operator so that it is correct to write
   ⍝ parser ← 'X' tok
   tok←{(=∘⍺⍺) terminal ⍵}

   ⍝ f map p : maps f on the result of p
   map←{                        ⍝ ⍵⍵ is a parser function
     s r R←⍵⍵ ⍵                 ⍝ ⍵ the usual array of tokens
     s (⍺⍺ r) R                 ⍝ ⍺⍺ is a function that maps tokens to something else
   }

   ⍝ x cmap parser : shorthand for {x} map parser
   cmap←{x←⍺ ◊ {x} map ⍺⍺ ⍵}

   ⍝ parser many : builds a new parser that parses 0 or many occurrences of parser
   many←{
     P←⍺⍺                       ⍝
     s r R1←⍺⍺ ⍵

     s: {
       s rs R2←P many R1
       Ok (r,rs) R2
     }⍬
     Ok r R1
   }

   seq←{
     p1←⍺⍺
     p2←⍵⍵
     inp←⍵
     s1 r1 R1←p1 inp

     s1:{
       s2 r2 R2←p2 R1

       s2: 1 (r1 r2) R2
           Fail ⍬ inp
     }⍬
     Fail ⍬ inp
   }

   ⍝ Can parentheses be simplified?
   some←{((⊃,/) map (⍺⍺ seq (⍺⍺ many))) ⍵}

   alt←{
     s r R←⍺⍺ ⍵

     s: s r R
        ⍵⍵ ⍵
   }

   skip←{{⍬} map ⍺⍺ ⍵}

   ⍝ Grammar
   ⍝ -------

   ⍝ Special chars
   NL←⎕ucs 13
   TAB←⎕ucs 9
   SPC←' '
   WS←TAB,SPC
   WSNL←TAB,WS

   isAlpha←{⍵∊'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'}
   isDigit←{⍵∊'0123456789'}
   isAlphaNum←isAlpha∨isDigit

   ⍝ Lexing - basic chars
   digit←isDigit terminal
   alpha←isAlpha terminal
   alphaNum←isAlphaNum terminal
   newline←NL tok
   whitespace←(∊∘WSNL) terminal

   ⍝ terminals
   ⍝ ---------
   ⍝ symbol←alpha seq (alphaNum many)
   number←digit some

   empty←{0=⍴⍵}
   eof←{
     0=⍴⍵: Ok ⍬ ⍵
           Fail ⍬ ⍵
   }

   ⍝ Strings

   ⍝ string : parse a string literal, escaped with the usual C rules.
   ⍝          returns the escaped string
   string←{
     dquote←'"' tok

     stringRest←{
       escape←{
         ⍝ we append ⍵ (the second char in the escape sequence) so that it is
         ⍝ translated to itself when it's not a known escape sequence
         escapes←'trbn',⍵
         trans←(⎕ucs 9 10 8 13),⍵
         1↑(⍵=escapes)/trans
       }

       c←1↑⍵

       empty ⍵:  Fail ⍬ ⍵
       c='\': {(escape y),⍵} map ∇ 2↓⍵
       c='"': Ok '' (1↓⍵)
       {c,⍵} map ∇ 1↓⍵
     }

     {⊃1↓⍵} map (dquote seq stringRest) ⍵
   }

   ⍝ stringNE : parse a string literal, return the string literal itself
   stringNE←{
     dquote←'"' tok

     stringRest←{
       c←1↑⍵
       cc←2↑⍵

       empty ⍵: Fail ⍬ ⍵
       c='\':   {cc,⍵} map ∇ 2↓⍵
       c='"':   Ok '"' (1↓⍵)
                {c,⍵} map ∇ 1↓⍵
     }

     {∊⍵} map (dquote seq stringRest) ⍵
   }

   flat←{{∊⍵} map ⍺⍺ ⍵}

   integer←digit many
   symbol←{
     ((digit many) seq (alpha some) seq (alphaNum many)) flat ⍵
   }

   quote      ← '''' tok
   unquote    ← '~'  tok
   deref      ← '@'  tok
   quasiquote ← '`'  tok
   openParen  ← '('  tok
   semicolon  ← ';'  tok

   read←{⍵}
   eval←{⍵}
   print←{⍵}
   rep←print∘eval∘read
 :EndNamespace
	:Namespace p
	⍝ Parsers take a stream of token and return three values:
	⍝ success word rest ← parser 'my sentence'
	⍝ success: 1 (success) or 0 (failure)

	⍝ Coding convention: parsing result
	⍝ s r R ← parser input
	⍝ s: status, can be either Ok (1) or Fail (0)
	⍝ r: result, any value in case of success, ⍬ in case of failure
	⍝ R: rest, the remaining input to be parsed.

	⍝ Documentation conventions:
	⍝ P : a parser, that is, a function that takes a vector
	⍝ of tokens and returns the triple (s r R)

	Ok←1
	Fail←0

	⍝ pred terminal : scans and returns one token T if (pred T), fails otherwise
	terminal←{ ⍝ ⍵ is an array of tokens, ⍺⍺ a predicate
	⍺⍺ 1↑⍵: Ok (1↑⍵) (1↓⍵) ⍝ if (⍺⍺ ⍵[1]) [Ok w[1] ⍵[1..]]
	Fail ⍬ ⍵ ⍝ else fail empty ⍵
	}

	⍝ implemented as operator so that it is correct to write
	⍝ parser ← 'X' tok
	tok←{(=∘⍺⍺) terminal ⍵}

	⍝ f map p : maps f on the result of p
	map←{ ⍝ ⍵⍵ is a parser function
	s r R←⍵⍵ ⍵ ⍝ ⍵ the usual array of tokens
	s (⍺⍺ r) R ⍝ ⍺⍺ is a function that maps tokens to something else
	}

	⍝ x cmap parser : shorthand for {x} map parser
	cmap←{x←⍺ ◊ {x} map ⍺⍺ ⍵}

	⍝ parser many : builds a new parser that parses 0 or many occurrences of parser
	many←{
	P←⍺⍺ ⍝
	s r R1←⍺⍺ ⍵

	s: {
	s rs R2←P many R1
	Ok (r,rs) R2
	}⍬
	Ok r R1
	}

	seq←{
	p1←⍺⍺
	p2←⍵⍵
	inp←⍵
	s1 r1 R1←p1 inp

	s1:{
	s2 r2 R2←p2 R1

	s2: 1 (r1 r2) R2
	Fail ⍬ inp
	}⍬
	Fail ⍬ inp
	}

	⍝ Can parentheses be simplified?
	some←{((⊃,/) map (⍺⍺ seq (⍺⍺ many))) ⍵}

	alt←{
	s r R←⍺⍺ ⍵

	s: s r R
	⍵⍵ ⍵
	}

	skip←{{⍬} map ⍺⍺ ⍵}

	⍝ Grammar
	⍝ -------

	⍝ Special chars
	NL←⎕ucs 13
	TAB←⎕ucs 9
	SPC←' '
	WS←TAB,SPC
	WSNL←TAB,WS

	isAlpha←{⍵∊'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'}
	isDigit←{⍵∊'0123456789'}
	isAlphaNum←isAlpha∨isDigit

	⍝ Lexing - basic chars
	digit←isDigit terminal
	alpha←isAlpha terminal
	alphaNum←isAlphaNum terminal
	newline←NL tok
	whitespace←(∊∘WSNL) terminal

	⍝ terminals
	⍝ ---------
	⍝ symbol←alpha seq (alphaNum many)
	number←digit some

	empty←{0=⍴⍵}
	eof←{
	0=⍴⍵: Ok ⍬ ⍵
	Fail ⍬ ⍵
	}

	⍝ Strings

	⍝ string : parse a string literal, escaped with the usual C rules.
	⍝ returns the escaped string
	string←{
	dquote←'"' tok

	stringRest←{
	escape←{
	⍝ we append ⍵ (the second char in the escape sequence) so that it is
	⍝ translated to itself when it's not a known escape sequence
	escapes←'trbn',⍵
	trans←(⎕ucs 9 10 8 13),⍵
	1↑(⍵=escapes)/trans
	}

	c←1↑⍵

	empty ⍵: Fail ⍬ ⍵
	c='\': {(escape y),⍵} map ∇ 2↓⍵
	c='"': Ok '' (1↓⍵)
	{c,⍵} map ∇ 1↓⍵
	}

	{⊃1↓⍵} map (dquote seq stringRest) ⍵
	}

	⍝ stringNE : parse a string literal, return the string literal itself
	stringNE←{
	dquote←'"' tok

	stringRest←{
	c←1↑⍵
	cc←2↑⍵

	empty ⍵: Fail ⍬ ⍵
	c='\': {cc,⍵} map ∇ 2↓⍵
	c='"': Ok '"' (1↓⍵)
	{c,⍵} map ∇ 1↓⍵
	}

	{∊⍵} map (dquote seq stringRest) ⍵
	}

	flat←{{∊⍵} map ⍺⍺ ⍵}

	integer←digit many
	symbol←{
	((digit many) seq (alpha some) seq (alphaNum many)) flat ⍵
	}

	quote ← '''' tok
	unquote ← '~' tok
	deref ← '@' tok
	quasiquote ← '`' tok
	openParen ← '(' tok
	semicolon ← ';' tok

	read←{⍵}
	eval←{⍵}
	print←{⍵}
	rep←print∘eval∘read
	:EndNamespace