jirutka/InlineXHtmlPegdownPluginParser.java

## InlineXHtmlPegdownPluginParser.java
import java.util.ArrayList;
import java.util.List;

import org.parboiled.Rule;
import org.parboiled.matchers.FirstOfMatcher;
import org.parboiled.matchers.Matcher;
import org.pegdown.Parser;
import org.pegdown.ast.InlineHtmlNode;
import org.pegdown.plugins.InlinePluginParser;

/**
 * Pluggable parser for Pegdown that parses inline XHTML/XML.
 *
 * <p>This is intended to replace the Inline HTML feature of the Pegdown parser which is too lenient
 * and so very hard to escape correctly in a renderer. XML syntax is stricter and so more suitable
 * for inline (X)HTML in the Markdown syntax.</p>
 *
 * <p>There are some restrictions and one relaxation:</p>
 * <ul>
 *     <li>Only tags, attributes and comments are supported.</li>
 *     <li>No new lines are permitted inside tags.</li>
 *     <li>An attribute value may be left unquoted when it doesn't contain spaces or '>' characters.</li>
 * </ul>
 */
public class InlineXHtmlPegdownPluginParser extends Parser implements InlinePluginParser
{

    /**
     * Creates the plugin parser by delegating to the Pegdown {@link Parser} constructor.
     */
    public InlineXHtmlPegdownPluginParser()
    {
        // NOTE(review): presumably "all extensions" and a 1000 ms parsing timeout -- confirm
        // against the Pegdown Parser constructor. The upper-case 'L' suffix is used on purpose:
        // a lower-case 'l' is easily misread as the digit '1'.
        super(ALL, 1000L, DefaultParseRunnerProvider);
    }

    /**
     * Returns the inline rules contributed by this plugin.
     *
     * @return {@link #InlineXHtml()} followed by {@link #HideInlineHtmlRule()}, in this order
     */
    @Override
    public Rule[] inlinePluginRules()
    {
        // HideInlineHtmlRule must be after InlineXHtml and before InlineHtml
        return new Rule[]{ InlineXHtml(), HideInlineHtmlRule() };
    }

    /**
     * Rule that "hides" the {@link Parser#InlineHtml() InlineHtml} rule from the Pegdown parser.
     *
     * <p>This rule tests {@link Parser#InlineHtml() InlineHtml} against an input and when it
     * succeeds, then it tries all of the subrules from the
     * {@link Parser#NonLinkInline() NonLinkInline} except the {@code InlineHtml} rule. This means
     * that the parser never reaches the {@code InlineHtml} rule and an inline HTML is parsed as
     * a normal text.</p>
     *
     * <p>Why so complicated? Parsing of an inline HTML cannot be simply disabled in Pegdown, only
     * suppressed, i.e. HTML is parsed and then dropped. The {@link Parser} class cannot be smoothly
     * subclassed (see <a href="https://github.com/sirthias/pegdown/issues/54">#54</a>) so there's
     * no straightforward way to override the {@code InlineHtml} rule. Therefore this is
     * probably the most elegant way to do it without patching the Pegdown parser.</p>
     *
     * @return the rule described above
     * @throws ClassCastException if {@code NonLinkInline()} does not yield a {@link FirstOfMatcher}
     */
    public Rule HideInlineHtmlRule()
    {
        // The children of NonLinkInline are needed, hence the cast to the concrete matcher type.
        FirstOfMatcher nonLinkInline = (FirstOfMatcher) NonLinkInline();
        List<Matcher> matchers = new ArrayList<Matcher>();

        // Copy all matchers but InlineHtml (identified by its rule label)
        for (Matcher matcher : nonLinkInline.getChildren()) {
            if (!"InlineHtml".equals(matcher.getLabel())) {
                matchers.add(matcher);
            }
        }
        return NodeSequence(
            // Only a look-ahead: the rule applies solely where InlineHtml would have matched
            Test(InlineHtml()),
            // This is basically NonLinkInline rule but without InlineHtml
            FirstOf(matchers.toArray())
        );
    }

    /**
     * Rule for an inline XHTML/XML.
     *
     * <p>Matches either an HTML comment or an {@link #XmlTag() XML tag} and pushes
     * an {@link InlineHtmlNode} holding the matched text.</p>
     *
     * @return the rule described above
     */
    public Rule InlineXHtml()
    {
        return NodeSequence(
            FirstOf(
                HtmlComment(),  // comments are the same in XML and HTML, so reuse the rule
                XmlTag()
            ), push(new InlineHtmlNode(match()))
        );
    }

    /**
     * Rule for an XML tag.
     * It must not contain new lines.
     *
     * <p>The accepted form is {@code '<'}, an optional {@code '/'}, a name of one or more
     * alphanumeric characters, any number of {@link #XmlAttribute() attributes}, an optional
     * {@code '/'} and the final {@code '>'}.</p>
     *
     * @return the rule described above
     */
    public Rule XmlTag()
    {
        return Sequence(
            '<', Optional('/'),
            OneOrMore(
                Alphanumeric()
            ),
            Sp(),
            ZeroOrMore(
                XmlAttribute(),
                Sp()
            ),
            Optional('/'), '>'
        );
    }

    /**
     * Rule for an XML attribute.
     * An attribute value may be left unquoted when it doesn't contain spaces or '>' characters.
     *
     * <p>The attribute name consists of one or more alphanumeric characters, {@code '-'} or
     * {@code '_'}; the {@code '='} sign and a value are mandatory.</p>
     *
     * @return the rule described above
     */
    public Rule XmlAttribute()
    {
        return Sequence(
            OneOrMore(
                FirstOf(Alphanumeric(), '-', '_')
            ),
            Sp(), '=', Sp(),
            FirstOf(
                Quoted(),
                // Non-quoted value is not valid in XML, but we need not be so strict
                OneOrMore(
                    TestNot('>'),
                    Nonspacechar()
                )
            )
        );
    }
}
	import java.util.ArrayList;
	import java.util.List;

	import org.parboiled.Rule;
	import org.parboiled.matchers.FirstOfMatcher;
	import org.parboiled.matchers.Matcher;
	import org.pegdown.Parser;
	import org.pegdown.ast.InlineHtmlNode;
	import org.pegdown.plugins.InlinePluginParser;

	/**
	 * Pluggable parser for Pegdown that parses inline XHTML/XML.
	 *
	 * <p>This is intended to replace the Inline HTML feature of the Pegdown parser which is too lenient
	 * and so very hard to escape correctly in a renderer. XML syntax is stricter and so more suitable
	 * for inline (X)HTML in the Markdown syntax.</p>
	 *
	 * <p>There are some restrictions and one relaxation:</p>
	 * <ul>
	 * <li>Only tags, attributes and comments are supported.</li>
	 * <li>No new lines are permitted inside tags.</li>
	 * <li>An attribute value may be left unquoted when it doesn't contain spaces or '>' characters.</li>
	 * </ul>
	 */
	public class InlineXHtmlPegdownPluginParser extends Parser implements InlinePluginParser
	{

	    /**
	     * Creates the plugin parser by delegating to the Pegdown {@link Parser} constructor.
	     */
	    public InlineXHtmlPegdownPluginParser()
	    {
	        // NOTE(review): presumably "all extensions" and a 1000 ms parsing timeout -- confirm
	        // against the Pegdown Parser constructor. The upper-case 'L' suffix is used on purpose:
	        // a lower-case 'l' is easily misread as the digit '1'.
	        super(ALL, 1000L, DefaultParseRunnerProvider);
	    }

	    /**
	     * Returns the inline rules contributed by this plugin.
	     *
	     * @return {@link #InlineXHtml()} followed by {@link #HideInlineHtmlRule()}, in this order
	     */
	    @Override
	    public Rule[] inlinePluginRules()
	    {
	        // HideInlineHtmlRule must be after InlineXHtml and before InlineHtml
	        return new Rule[]{ InlineXHtml(), HideInlineHtmlRule() };
	    }

	    /**
	     * Rule that "hides" the {@link Parser#InlineHtml() InlineHtml} rule from the Pegdown parser.
	     *
	     * <p>This rule tests {@link Parser#InlineHtml() InlineHtml} against an input and when it
	     * succeeds, then it tries all of the subrules from the
	     * {@link Parser#NonLinkInline() NonLinkInline} except the {@code InlineHtml} rule. This means
	     * that the parser never reaches the {@code InlineHtml} rule and an inline HTML is parsed as
	     * a normal text.</p>
	     *
	     * <p>Why so complicated? Parsing of an inline HTML cannot be simply disabled in Pegdown, only
	     * suppressed, i.e. HTML is parsed and then dropped. The {@link Parser} class cannot be smoothly
	     * subclassed (see <a href="https://github.com/sirthias/pegdown/issues/54">#54</a>) so there's
	     * no straightforward way to override the {@code InlineHtml} rule. Therefore this is
	     * probably the most elegant way to do it without patching the Pegdown parser.</p>
	     *
	     * @return the rule described above
	     * @throws ClassCastException if {@code NonLinkInline()} does not yield a {@link FirstOfMatcher}
	     */
	    public Rule HideInlineHtmlRule()
	    {
	        // The children of NonLinkInline are needed, hence the cast to the concrete matcher type.
	        FirstOfMatcher nonLinkInline = (FirstOfMatcher) NonLinkInline();
	        List<Matcher> matchers = new ArrayList<Matcher>();

	        // Copy all matchers but InlineHtml (identified by its rule label)
	        for (Matcher matcher : nonLinkInline.getChildren()) {
	            if (!"InlineHtml".equals(matcher.getLabel())) {
	                matchers.add(matcher);
	            }
	        }
	        return NodeSequence(
	            // Only a look-ahead: the rule applies solely where InlineHtml would have matched
	            Test(InlineHtml()),
	            // This is basically NonLinkInline rule but without InlineHtml
	            FirstOf(matchers.toArray())
	        );
	    }

	    /**
	     * Rule for an inline XHTML/XML.
	     *
	     * <p>Matches either an HTML comment or an {@link #XmlTag() XML tag} and pushes
	     * an {@link InlineHtmlNode} holding the matched text.</p>
	     *
	     * @return the rule described above
	     */
	    public Rule InlineXHtml()
	    {
	        return NodeSequence(
	            FirstOf(
	                HtmlComment(), // comments are the same in XML and HTML, so reuse the rule
	                XmlTag()
	            ), push(new InlineHtmlNode(match()))
	        );
	    }

	    /**
	     * Rule for an XML tag.
	     * It must not contain new lines.
	     *
	     * <p>The accepted form is {@code '<'}, an optional {@code '/'}, a name of one or more
	     * alphanumeric characters, any number of {@link #XmlAttribute() attributes}, an optional
	     * {@code '/'} and the final {@code '>'}.</p>
	     *
	     * @return the rule described above
	     */
	    public Rule XmlTag()
	    {
	        return Sequence(
	            '<', Optional('/'),
	            OneOrMore(
	                Alphanumeric()
	            ),
	            Sp(),
	            ZeroOrMore(
	                XmlAttribute(),
	                Sp()
	            ),
	            Optional('/'), '>'
	        );
	    }

	    /**
	     * Rule for an XML attribute.
	     * An attribute value may be left unquoted when it doesn't contain spaces or '>' characters.
	     *
	     * <p>The attribute name consists of one or more alphanumeric characters, {@code '-'} or
	     * {@code '_'}; the {@code '='} sign and a value are mandatory.</p>
	     *
	     * @return the rule described above
	     */
	    public Rule XmlAttribute()
	    {
	        return Sequence(
	            OneOrMore(
	                FirstOf(Alphanumeric(), '-', '_')
	            ),
	            Sp(), '=', Sp(),
	            FirstOf(
	                Quoted(),
	                // Non-quoted value is not valid in XML, but we need not be so strict
	                OneOrMore(
	                    TestNot('>'),
	                    Nonspacechar()
	                )
	            )
	        );
	    }
	}