@conartist6
Last active September 18, 2023 00:39
CSTML BABLR grammar (will run with https://github.com/bablr-lang/bablr-vm)
/*
This file contains a formal grammar defined by yielding instructions to a state machine.
The grammar is extensible, because you can always wrap it in a higher-order grammar!
Formally, the system is a VM executing an https://en.wikipedia.org/wiki/Earley_parser.
The VM is not yet complete enough to execute this grammar, but it soon will be.
Usage will be:
```js
import { parse } from 'cst-tokens';
import { grammar } from 'https://url/to/grammar';
result = parse(grammar, `2 + 2`);
```
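The instruction-yielding style itself can be sketched independently of the VM. Here is a minimal, self-contained illustration of a production as a generator driven by an interpreter loop (the names `digits` and `runGrammar` are hypothetical, not the cst-tokens API):

```js
// A production is a generator that yields instructions; the "VM" is a loop
// that interprets each instruction against the source text and resumes the
// generator with the result.
function* digits() {
  // Instruction: consume one or more digits at the current position.
  return yield { type: 'eat', pattern: /\d+/y };
}

function runGrammar(production, text) {
  let index = 0;
  const gen = production();
  let step = gen.next();
  while (!step.done) {
    const { pattern } = step.value;
    pattern.lastIndex = index;
    const match = pattern.exec(text);
    if (!match) throw new Error(`parse failed at index ${index}`);
    index += match[0].length;
    step = gen.next(match[0]); // send the matched text back into the production
  }
  return step.value;
}

console.log(runGrammar(digits, '42')); // prints "42"
```

Because productions only yield descriptions of work, a higher-order grammar can intercept and rewrite those instructions before the VM sees them, which is what makes the system extensible.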
For some estree-ish definition of `grammar`, `result` might be:
```cstml
<!doctype cstml>
<cstml validate="https://url/to/grammar">
<BinaryExpression [Expression]>
<NumericLiteral [Expression] path="left">
<| Digits "2" |>
</>
<| Trivia " " |>
<| Punctuator "+" path="operator" |>
<| Trivia " " |>
<[Expression] path="right"/>
</>
</cstml>
```
The VM guarantees that `print(parse(grammar, text))` is exactly `text`.
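That guarantee falls out of the design: every input character, including whitespace, lands in some token (e.g. `Trivia`), so printing is just concatenation. A toy, self-contained illustration of the principle (not the real `print` or tokenizer):

```js
// If a tokenizer accounts for every input character -- including whitespace --
// then printing is just concatenating token values in order.
function tokenize(text) {
  const tokens = [];
  const pattern = /(\d+)|(\s+)|(.)/gy; // digits, trivia, or any other character
  let match;
  while ((match = pattern.exec(text))) {
    const type = match[1] ? 'Digits' : match[2] ? 'Trivia' : 'Punctuator';
    tokens.push({ type, value: match[0] });
  }
  return tokens;
}

const print = (tokens) => tokens.map((t) => t.value).join('');

const text = '2 + 2';
console.assert(print(tokenize(text)) === text); // lossless roundtrip
```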
*/
import { str as strFrom } from 'https://esm.sh/iter-tools-es@7.5.3/methods/str';
import { map } from 'https://esm.sh/iter-tools-es@7.5.3/methods/map';
import { objectEntries } from 'https://esm.sh/@cst-tokens/helpers@0.14.0/object';
import { escapeCharacterClass } from 'https://esm.sh/@cst-tokens/helpers@0.14.0/regex';
import * as productions from 'https://esm.sh/@cst-tokens/helpers@0.14.0/productions';
import * as sym from 'https://esm.sh/@cst-tokens/helpers@0.14.0/symbols';
import { i } from 'https://esm.sh/@cst-tokens/helpers@0.14.0/shorthand';
// Mostly borrowed from JSON
const escapables = new Map(
objectEntries({
'"': '"',
"'": "'",
'\\': '\\',
'/': '/',
b: '\b',
f: '\f',
n: '\n',
r: '\r',
t: '\t',
}),
);
class NodeGrammar {
@InjectFrom(productions)
List() {}
@Node
*Document() {
yield i`eatMatch(<| |>)`;
yield i`eat(<DoctypeTag>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<Parsers>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<Node>)`;
yield i`eatMatch(<| |>)`;
}
@Node
@Cover('Tag')
*DoctypeTag() {
yield i`eat(<| Punctuator '<' startSpan='Tag' balanced='>' |>)`;
yield i`eat(<| Punctuator '!' |>)`;
yield i`eat(<| Keyword 'doctype' |>)`;
yield i`eat(<| |>)`;
yield i`eat(<| Keyword 'cstml' |>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<| Punctuator '>' balanced |>)`;
}
*Parsers() {
yield i`eat(<ParsersOpenTag>)`;
while (yield i`eatMatch([ <| |> <ParserTag> ])`);
yield i`eatMatch(<| |>)`;
yield i`eat(<ParsersCloseTag>)`;
}
@Node
@Cover('Tag')
*ParsersOpenTag() {
yield i`eat(<| Punctuator '<' startSpan='Tag' balanced='>' |>)`;
yield i`eat(<| Punctuator '!' |>)`;
yield i`eat(<| Keyword 'parsers' |>)`;
yield i`eat(<| Punctuator '>' balanced |>)`;
}
@Node
@Cover('Tag')
*ParserTag() {
yield i`eat(<| Punctuator '<' startSpan='Tag' balanced='>' |>)`;
yield i`eat(<Identifier path='name'>)`;
yield i`eat(<| |>)`;
yield i`eat(<String path='href'>)`;
yield i`eat(<| Punctuator '>' balanced |>)`;
}
@Node
@Cover('Tag')
*ParsersCloseTag() {
yield i`eat(<| Punctuator '</' startSpan='Tag' balanced='>' |>)`;
yield i`eatMatch(<Keyword value='parsers' path='type'>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<| Punctuator '>' balanced |>)`;
}
*Fragment() {
while (yield i`eatMatch([ <| |> || <Element guard='<'> ])`);
}
*Element({ attrs }) {
const [tag] = yield i`eat(<Tag ${attrs}>)`;
if (tag.type === 'NodeOpenTag') {
yield i`eat(<Fragment>)`;
yield i`eat(<NodeCloseTag type=${tag.value.type}>)`;
} else if (tag.type === 'NodeCloseTag') {
yield i`fail()`;
}
}
*Tag() {
yield i`eat([
<TokenGapTag { guard: '<|[' }> ||
<TokenTag { guard: '<|' }> ||
<NodeGapTag { guard: '<[' }> ||
<NodeOpenTag { guard: '<' }> ||
<NodeCloseTag { guard: '</' }>
])`;
}
*Node() {
const openTag = yield i`eat(<NodeOpenTag>)`;
yield i`eat(<Fragment>)`;
yield i`eat(<NodeCloseTag type=${openTag.value.type}>)`;
}
@Node
@Cover('Tag')
*NodeOpenTag() {
yield i`eat(<| Punctuator '<' startSpan='Tag' balanced='>' |>)`;
if (yield i`eatMatch([ <| Identifier |> <| Punctuator ':' |> ])`) {
yield i`eat(<Identifier path='language'>)`;
yield i`eat(<| Punctuator ':' |>)`;
yield i`eat(<Identifier path='type'>)`;
} else {
yield i`eat(<Identifier path='type'>)`;
}
const gapOpen = yield i`eatMatch([
<| |>
<| Punctuator '[' startSpan='Gap' balanced=']' |>
])`;
if (gapOpen) {
yield i`eatMatch(<| |>)`;
yield i`eat(<Identifier path='gapType'>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<| Punctuator ']' balanced |>)`;
}
yield i`eatMatch(<Attributes>)`;
yield i`eat(<| Punctuator '>' balanced |>)`;
}
@Node
@Cover('Tag')
*NodeCloseTag({ attrs }) {
yield i`eat(<| Punctuator '</' startSpan='Tag' balanced='>' |>)`;
yield i`eatMatch(<Identifier value=${attrs.get('type')} path='type'>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<| Punctuator '>' balanced |>)`;
}
@Node
@Cover('Tag')
*TokenTag() {
yield i`eat(<| Punctuator '<|' startSpan='Tag' balanced='|>' |>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<Identifier path='type'>)`;
yield i`eatMatch(<Attributes>)`;
yield i`eat(<| Punctuator '|>' balanced |>)`;
}
@Node
@Cover('Tag')
*NodeGapTag() {
yield i`eat(<| Punctuator '<' startSpan='Tag' balanced='>' |>)`;
yield i`eat(<| Punctuator '[' startSpan='Gap' balanced=']' |>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<Identifier path='type'>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<| Punctuator ']' balanced |>)`;
yield i`eatMatch(<Attributes>)`;
yield i`eat(<| Punctuator '/>' balanced |>)`;
}
@Node
@Cover('Tag')
*TokenGapTag() {
yield i`eat(<| Punctuator '<|' startSpan='Tag' balanced='|>' |>)`;
yield i`eat(<| Punctuator '[' startSpan='Gap' balanced=']' |>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<Identifier path='type'>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<| Punctuator ']' balanced |>)`;
yield i`eatMatch([ <| |> <| String |> ])`;
yield i`eatMatch(<Attributes>)`;
yield i`eat(<| Punctuator '|>' balanced |>)`;
}
*Attributes() {
yield i`eatMatch([
<| |>
<List { separator=<| |> matchable=<Attribute path='attrs'> }>
])`;
}
@Node
*Attribute() {
yield i`eat(<| Identifier |>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<| Punctuator '=' |>)`;
yield i`eatMatch(<| |>)`;
yield i`eat(<String>)`;
}
*Identifier({ attrs }) {
yield i`eat(<| Identifier ${attrs.get('value')} |>)`;
}
*String() {
yield i`eat(<| String |>)`;
}
}
class TokenGrammar {
@Token
*Keyword({ value }) {
yield i`eat(${value})`;
}
@Token
*Punctuator({ value }) {
yield i`eat(${value})`;
}
@Token
*Identifier({ value }) {
const result = yield value ? i`eat(${value})` : i`eat(/\w+/y)`;
// An explicitly supplied value must itself be a valid identifier
if (result && value && !/^\w+$/.test(result)) {
throw new Error('value can only match valid identifiers');
}
}
@Token
*Literal({ state: { span } }) {
if (span === 'String:Single') {
yield i`eat(/[^'\n]+/y)`;
} else if (span === 'String:Double') {
yield i`eat(/[^"\n]+/y)`;
} else {
throw new Error(`{span: ${span}} does not allow literals`);
}
}
*EscapeSequence({ state: { span } }) {
if (!span.startsWith('String')) {
throw new Error(`{span: ${span}} does not define an escape sequence`);
}
yield i`guard('\\')`;
yield i`eat(<| Escape |>)`;
yield i`eat(<| EscapeCode |>)`;
}
@Token
*Escape({ state: { span } }) {
if (!span.startsWith('String')) {
throw new Error(`{span: ${span}} does not define an escape`);
}
yield i`eat('\\')`;
}
@Token
*EscapeCode({ state: { span } }) {
if (!span.startsWith('String')) {
throw new Error(`{span: ${span}} does not define any escape codes`);
}
if (yield i`eatMatch(/u{\d{1,6}}/y)`) {
// break
} else if (yield i`eatMatch(/u\d\d\d\d/y)`) {
// break
} else if (span !== 'Bare') {
if (yield i`eatMatch(/[${strFrom(map(escapeCharacterClass, escapables.keys()))}]/y)`) {
// break
}
}
}
*String() {
const lq = yield i`eat([
<| Punctuator "'" startSpan='String:Single' balanced="'" |> ||
<| Punctuator '"' startSpan='String:Double' balanced='"' |>
])`;
while (yield i`eatMatch([ <| Literal |> || <| EscapeSequence |> ])`);
yield i`eat(<| Punctuator ${lq.value} balanced |>)`;
}
@Token
*Trivia({ state }) {
const { span } = state;
if (span === 'Bare' || span === 'Tag') {
yield i`eat(/\s+/y)`;
} else if (span === 'TokenTag') {
yield i`eat(/[ \t]+/y)`;
} else {
throw new Error(`Trivia is not supported in {span: ${span}}`);
}
}
}
export const grammars = {
[sym.node]: NodeGrammar,
[sym.token]: TokenGrammar,
};
@guest271314
@conartist6 (Author)

Exactly! I was aware of that context from my work in frontend development. I didn't initially set out to replicate those patterns, but eventually I realized that I was, and that I should embrace it. That's why this is a streaming parser: as long as you haven't forked the call stack (by making a call with match or eatMatch), it will emit tags and tokens as it parses them.
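The streaming behavior described here can be sketched with a generator that yields each token the moment it matches, so a consumer can react before the parse finishes (illustrative only; the names are not the bablr-vm API):

```js
// Yield each token as soon as it is matched instead of building a full
// result first; the consumer's loop body runs while parsing is in progress.
function* streamTokens(text) {
  const pattern = /(\d+)|(\s+)|(.)/gy; // digits, trivia, or any other character
  let match;
  while ((match = pattern.exec(text))) {
    const type = match[1] ? 'Digits' : match[2] ? 'Trivia' : 'Punctuator';
    yield { type, value: match[0] };
  }
}

const types = [];
for (const token of streamTokens('2 + 2')) {
  types.push(token.type); // each token is available immediately
}
console.log(types.join(' ')); // prints "Digits Trivia Punctuator Trivia Digits"
```

Forking the call stack with match/eatMatch would require buffering output until the speculative branch commits, which is why streaming only holds while no fork is live.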
