Munawwar/html-validator.js

## html-validator.js
/**
 * Detect unsafe (and potentially unsafe) unbalanced tags in a given HTML snippet.
 * Hints taken from an HTML parser (https://gist.github.com/cburgmer/2877758).
 *
 * Example:
 * An unclosed div tag is considered unsafe, because if the snippet is pasted in between two div tags
 * then it could end up breaking the HTML document.
 * Tags with optional end tags (tags that you can intentionally leave open, like <table><tr><td>some text</table>) are also considered unsafe, for the same reason.
 * However an unclosed void tag (like meta tag) is safe, because browsers will ignore it without any side effects.
 *
 * Usage: ValidateHtml('<html string>');
 */
(function (root, factory) {
    // UMD-style export: AMD loader first, then CommonJS, otherwise a global on `root`.
    if (typeof define === "function" && define.amd) {
        define(factory);
    } else if (typeof exports === 'object' && typeof module !== 'undefined') { //For NodeJS
        module.exports = factory();
    } else { //For browsers
        root.ValidateHtml = factory();
    }
// `this` is undefined in strict-mode scripts and ES modules, so fall back to the real global object.
}(this || (typeof globalThis === 'object' ? globalThis : (typeof self === 'object' ? self : undefined)), function () {
    /**
     * Turn a comma separated list of names into a lookup table.
     * The table has no prototype, so tag names such as "constructor" or
     * "__proto__" can never match an inherited Object property.
     * @param {string} str Comma separated names; empty entries are ignored.
     * @returns {Object} Map of name -> true.
     */
    function unwrap(str) {
        var names = str.split(','),
            table = Object.create(null),
            i;
        for (i = 0; i < names.length; i += 1) {
            if (names[i]) {
                table[names[i]] = true;
            }
        }
        return table;
    }

    /**
     * Build an error result, filling `{n}` placeholders in the message.
     * @param {string} status Machine readable error code.
     * @param {string} msg Message template. `{0}`, `{1}`, ... are replaced with the
     *   extra arguments; a placeholder preceded by a backslash is left untouched.
     * @returns {{status: string, message: string}}
     */
    function ERROR(status, msg) {
        var args = Array.prototype.slice.call(arguments, 2);
        // The optional backslash is captured (not required as a preceding character),
        // so back-to-back placeholders like "{0}{1}" are both substituted.
        msg = msg.replace(/(\\?)\{(\w+)\}/g, function (m, escaped, index) {
            var value = args[index];
            if (escaped) {
                return m;
            }
            return value !== undefined ? value : '';
        });
        return {
            status: status,
            message: msg
        };
    }

    /**
     * @param {string} text
     * @returns {number} Number of "\n" characters in `text`.
     */
    function countNewlines(text) {
        return (text.match(/\n/g) || []).length;
    }

    //HTML 4 and 5 void tags
    var voidTags = unwrap('area,base,basefont,br,col,command,embed,frame,hr,img,input,keygen,link,meta,param,source,track,wbr'),
        //Tags whose content is skipped up to the matching end tag instead of being parsed as markup.
        singlelevel = unwrap('script,style'),
        regxstr = {
            tagname: "[\\-A-Za-z0-9_:]+",
            attrname: "[\\w\\-]+",
            attrvalue: "(?:\"[^\"]*\"|'[^']*'|[^>\\s]+)" //quoted and unquoted strings
        },
        // Every pattern is anchored at the start of the remaining input and matches
        // "text before the next '<'" + the construct itself. [\s\S] is used instead of
        // (.|\n) so that U+2028/U+2029 inside the input do not cut a match short.
        regx = {
            // Start tag: <name attr attr=value attr="value" ...>; group 1 is the tag name.
            opentag: new RegExp('^[^<]*?<(' + regxstr.tagname + ')' +
                '(?:\\s+' + regxstr.attrname +
                    '(?:\\s*=\\s*' + regxstr.attrvalue + ')?' +
                ')*' +
                '[^>]*?>'),
            othertag: /^[^<]*?<([!\-\[\]\/A-Za-z0-9_:]+)[^>]*?>/, //close tags, doctype, comments, cdata
            comment: /^[^<]*?<!--[\s\S]*?-->/,
            cdata: /^[^<]*?<!\[CDATA\[[\s\S]*?\]\]>/
        };

    /**
     * Check that every non-void tag in an HTML snippet is properly closed and nested.
     * @param {string} html HTML snippet. null/undefined are treated as an empty snippet,
     *   other values are converted to a string.
     * @returns {true|{status: string, message: string}} `true` when balanced, otherwise the
     *   first problem found. `status` is one of 'MissingEndTag', 'ExtraTag', 'WrongTag',
     *   'CommentNotClosed' or 'CDATANotClosed'; `message` starts with the 1-based line number.
     */
    return function (html) {
        var str = String(html === null || html === undefined ? '' : html).replace(/\r/g, ''),
            line = 1, //1-based line number of the first character of `str`
            stack = [], //currently open tags: {tag, line}
            broken = '',
            matches, rawTag, tag, isCloseTag, lineNumber, consumed, last, pos, i;

        while (str) {
            matches = regx.opentag.exec(str) || regx.othertag.exec(str);
            if (!matches) {
                pos = str.indexOf('<');
                if (pos < 0) {
                    break; //only text left
                }
                //A '<' that does not start a tag: skip it, keeping the line count in sync.
                line += countNewlines(str.substring(0, pos));
                str = str.substring(pos + 1);
                continue;
            }
            rawTag = matches[1];
            tag = rawTag.toLowerCase(); //html is case insensitive
            //Line on which the '<' of this tag sits.
            lineNumber = line + countNewlines(str.substring(0, str.indexOf('<')));

            //Identify close tag
            isCloseTag = tag.charAt(0) === '/';
            if (isCloseTag) {
                tag = tag.substr(1);
            }

            if (tag.charAt(0) === '!') {
                //Doctype, comment or CDATA: nothing to balance, just skip the whole construct.
                if (tag.indexOf('![cdata[') === 0) {
                    consumed = regx.cdata.exec(str);
                    if (!consumed) {
                        broken = ERROR('CDATANotClosed', 'Line {0}: CDATA section not closed properly.', lineNumber);
                        break;
                    }
                } else if (tag.indexOf('!--') === 0) {
                    consumed = regx.comment.exec(str);
                    if (!consumed) {
                        broken = ERROR('CommentNotClosed', 'Line {0}: HTML comment not closed properly.', lineNumber);
                        break;
                    }
                } else {
                    consumed = matches;
                }
                line += countNewlines(consumed[0]);
                str = str.substring(consumed[0].length);
                continue;
            }

            //Consume the tag. Done before any branch below so multi-line tags never desync `line`.
            line += countNewlines(matches[0]);
            str = str.substring(matches[0].length);

            if (voidTags[tag]) {
                continue;
            }

            if (singlelevel[tag]) {
                if (isCloseTag) {
                    //The start tag branch below swallows its own end tag, so an end tag seen here is stray.
                    broken = ERROR('ExtraTag', 'Line {0}: Extra end tag found: {1}', lineNumber, '<' + rawTag + '>');
                    break;
                }
                //Remove everything up to and including the end tag (matched case-insensitively).
                consumed = new RegExp('^[\\s\\S]*?</' + tag + '[^>]*>', 'i').exec(str);
                if (!consumed) {
                    broken = ERROR('MissingEndTag', 'Line {0}: {1} start tag missing corresponding end tag.', lineNumber, '<' + tag + '>');
                    break;
                }
                line += countNewlines(consumed[0]);
                str = str.substring(consumed[0].length);
                continue;
            }

            if (!isCloseTag) {
                stack.push({
                    tag: tag,
                    line: lineNumber
                });
                continue;
            }

            if (stack.length === 0) {
                broken = ERROR('ExtraTag', 'Line {0}: Extra end tag found: {1}', lineNumber, '<' + rawTag + '>');
                break;
            }
            last = stack[stack.length - 1];
            if (last.tag !== tag) {
                //If some outer tag with this name is open, an inner tag was left unclosed;
                //otherwise the end tag has no start tag at all.
                pos = -1;
                for (i = 0; i < stack.length; i += 1) {
                    if (stack[i].tag === tag) {
                        pos = i;
                        break;
                    }
                }
                if (pos < 0) {
                    broken = ERROR('ExtraTag', 'Line {0}: Extra end tag found: {1}', lineNumber, '<' + rawTag + '>');
                } else {
                    broken = ERROR('WrongTag', 'Line {0}: {1} start tag from line {2} should be closed before {3}.', lineNumber,
                        '<' + last.tag + '>', last.line, '<' + rawTag + '>');
                }
                break;
            }
            stack.pop();
        }
        if (!broken && stack.length > 0) {
            last = stack[stack.length - 1];
            broken = ERROR('MissingEndTag', 'Line {0}: {1} start tag missing corresponding end tag.', last.line, '<' + last.tag + '>');
        }
        return broken ? broken : true;
    };
}));

## test.html
<!DOCTYPE html>
<html>
    <head>
        <!--
            Test this a multi line comment. >>
        -->
        <meta title="unclosed meta is ok. unclosed div isn't ok."></meta>
        <script>
            var script = '<script>';
        </script>
        <![CDATA[This is a cdata
            section]]>
    </head>
    <body>
        <input type=text data-extra="text"></input>
        <
            div>
        <div>
            <div></div >
        </div>
        <x:blah></x:blah>
        <x-custom></x-custom>
    </body>
</html>
</html>

## validator-test.html
<!DOCTYPE html>
<html>
    <head>
        <script src="html-validator.js"></script>
    </head>
    <body>
        <!-- Manual smoke test: loads test.html and logs the validator's result to the console. -->
        <script>
            var xhr = new XMLHttpRequest();
            // Asynchronous request: synchronous XHR blocks the page and is deprecated on the main thread.
            xhr.open('GET', 'test.html', true);
            xhr.onload = function () {
                if (xhr.status >= 400) {
                    console.error('Could not load test.html (HTTP ' + xhr.status + ')');
                    return;
                }
                console.log(ValidateHtml(xhr.responseText));
            };
            xhr.onerror = function () {
                console.error('Could not load test.html (request failed)');
            };
            xhr.send();
        </script>
    </body>
</html>

## validator.html
<!DOCTYPE HTML>
<html>
	<head>
		<script src="html-validator.js"></script>
	</head>
	<body>
		<!-- Interactive page: paste HTML into the first box, press the button, read the result in the second box. -->
		<textarea id="input" rows="25" style="width:100%"></textarea>
		<textarea id="output" readonly rows="5" style="width:100%"></textarea>
		<button id="btn">Validate HTML</button>
		<script type="text/javascript">
		(function () {
			var input = document.querySelector('#input'),
				output = document.querySelector('#output');
			// Show the validator's return value (true or a {status, message} object) as pretty-printed JSON.
			document.querySelector('#btn').onclick = function () {
				output.value = JSON.stringify(window.ValidateHtml(input.value), null, 2);
			};
		}());
		</script>
	</body>
</html>
	/**
	* Detect unsafe (and potentially unsafe) unbalanced tags in a given HTML snippet.
	* Hints taken from an HTML parser (https://gist.github.com/cburgmer/2877758).
	*
	* Example:
	* An unclosed div tag is considered unsafe, because if the snippet is pasted in between two div tags
	* then it could end up breaking the HTML document.
	* Tags with optional end tags (tags that you can intentionally leave open, like <table><tr><td>some text</table>) are also considered unsafe, for the same reason.
	* However an unclosed void tag (like meta tag) is safe, because browsers will ignore it without any side effects.
	*
	* Usage: ValidateHtml('<html string>');
	*/
	(function (root, factory) {
	    // UMD-style export: AMD loader first, then CommonJS, otherwise a global on `root`.
	    if (typeof define === "function" && define.amd) {
	        define(factory);
	    } else if (typeof exports === 'object' && typeof module !== 'undefined') { //For NodeJS
	        module.exports = factory();
	    } else { //For browsers
	        root.ValidateHtml = factory();
	    }
	// `this` is undefined in strict-mode scripts and ES modules, so fall back to the real global object.
	}(this || (typeof globalThis === 'object' ? globalThis : (typeof self === 'object' ? self : undefined)), function () {
	    /**
	     * Turn a comma separated list of names into a lookup table.
	     * The table has no prototype, so tag names such as "constructor" or
	     * "__proto__" can never match an inherited Object property.
	     * @param {string} str Comma separated names; empty entries are ignored.
	     * @returns {Object} Map of name -> true.
	     */
	    function unwrap(str) {
	        var names = str.split(','),
	            table = Object.create(null),
	            i;
	        for (i = 0; i < names.length; i += 1) {
	            if (names[i]) {
	                table[names[i]] = true;
	            }
	        }
	        return table;
	    }

	    /**
	     * Build an error result, filling `{n}` placeholders in the message.
	     * @param {string} status Machine readable error code.
	     * @param {string} msg Message template. `{0}`, `{1}`, ... are replaced with the
	     *   extra arguments; a placeholder preceded by a backslash is left untouched.
	     * @returns {{status: string, message: string}}
	     */
	    function ERROR(status, msg) {
	        var args = Array.prototype.slice.call(arguments, 2);
	        // The optional backslash is captured (not required as a preceding character),
	        // so back-to-back placeholders like "{0}{1}" are both substituted.
	        msg = msg.replace(/(\\?)\{(\w+)\}/g, function (m, escaped, index) {
	            var value = args[index];
	            if (escaped) {
	                return m;
	            }
	            return value !== undefined ? value : '';
	        });
	        return {
	            status: status,
	            message: msg
	        };
	    }

	    /**
	     * @param {string} text
	     * @returns {number} Number of "\n" characters in `text`.
	     */
	    function countNewlines(text) {
	        return (text.match(/\n/g) || []).length;
	    }

	    //HTML 4 and 5 void tags
	    var voidTags = unwrap('area,base,basefont,br,col,command,embed,frame,hr,img,input,keygen,link,meta,param,source,track,wbr'),
	        //Tags whose content is skipped up to the matching end tag instead of being parsed as markup.
	        singlelevel = unwrap('script,style'),
	        regxstr = {
	            tagname: "[\\-A-Za-z0-9_:]+",
	            attrname: "[\\w\\-]+",
	            attrvalue: "(?:\"[^\"]*\"|'[^']*'|[^>\\s]+)" //quoted and unquoted strings
	        },
	        // Every pattern is anchored at the start of the remaining input and matches
	        // "text before the next '<'" + the construct itself. [\s\S] is used instead of
	        // (.|\n) so that U+2028/U+2029 inside the input do not cut a match short.
	        regx = {
	            // Start tag: <name attr attr=value attr="value" ...>; group 1 is the tag name.
	            opentag: new RegExp('^[^<]*?<(' + regxstr.tagname + ')' +
	                '(?:\\s+' + regxstr.attrname +
	                    '(?:\\s*=\\s*' + regxstr.attrvalue + ')?' +
	                ')*' +
	                '[^>]*?>'),
	            othertag: /^[^<]*?<([!\-\[\]\/A-Za-z0-9_:]+)[^>]*?>/, //close tags, doctype, comments, cdata
	            comment: /^[^<]*?<!--[\s\S]*?-->/,
	            cdata: /^[^<]*?<!\[CDATA\[[\s\S]*?\]\]>/
	        };

	    /**
	     * Check that every non-void tag in an HTML snippet is properly closed and nested.
	     * @param {string} html HTML snippet. null/undefined are treated as an empty snippet,
	     *   other values are converted to a string.
	     * @returns {true|{status: string, message: string}} `true` when balanced, otherwise the
	     *   first problem found. `status` is one of 'MissingEndTag', 'ExtraTag', 'WrongTag',
	     *   'CommentNotClosed' or 'CDATANotClosed'; `message` starts with the 1-based line number.
	     */
	    return function (html) {
	        var str = String(html === null || html === undefined ? '' : html).replace(/\r/g, ''),
	            line = 1, //1-based line number of the first character of `str`
	            stack = [], //currently open tags: {tag, line}
	            broken = '',
	            matches, rawTag, tag, isCloseTag, lineNumber, consumed, last, pos, i;

	        while (str) {
	            matches = regx.opentag.exec(str) || regx.othertag.exec(str);
	            if (!matches) {
	                pos = str.indexOf('<');
	                if (pos < 0) {
	                    break; //only text left
	                }
	                //A '<' that does not start a tag: skip it, keeping the line count in sync.
	                line += countNewlines(str.substring(0, pos));
	                str = str.substring(pos + 1);
	                continue;
	            }
	            rawTag = matches[1];
	            tag = rawTag.toLowerCase(); //html is case insensitive
	            //Line on which the '<' of this tag sits.
	            lineNumber = line + countNewlines(str.substring(0, str.indexOf('<')));

	            //Identify close tag
	            isCloseTag = tag.charAt(0) === '/';
	            if (isCloseTag) {
	                tag = tag.substr(1);
	            }

	            if (tag.charAt(0) === '!') {
	                //Doctype, comment or CDATA: nothing to balance, just skip the whole construct.
	                if (tag.indexOf('![cdata[') === 0) {
	                    consumed = regx.cdata.exec(str);
	                    if (!consumed) {
	                        broken = ERROR('CDATANotClosed', 'Line {0}: CDATA section not closed properly.', lineNumber);
	                        break;
	                    }
	                } else if (tag.indexOf('!--') === 0) {
	                    consumed = regx.comment.exec(str);
	                    if (!consumed) {
	                        broken = ERROR('CommentNotClosed', 'Line {0}: HTML comment not closed properly.', lineNumber);
	                        break;
	                    }
	                } else {
	                    consumed = matches;
	                }
	                line += countNewlines(consumed[0]);
	                str = str.substring(consumed[0].length);
	                continue;
	            }

	            //Consume the tag. Done before any branch below so multi-line tags never desync `line`.
	            line += countNewlines(matches[0]);
	            str = str.substring(matches[0].length);

	            if (voidTags[tag]) {
	                continue;
	            }

	            if (singlelevel[tag]) {
	                if (isCloseTag) {
	                    //The start tag branch below swallows its own end tag, so an end tag seen here is stray.
	                    broken = ERROR('ExtraTag', 'Line {0}: Extra end tag found: {1}', lineNumber, '<' + rawTag + '>');
	                    break;
	                }
	                //Remove everything up to and including the end tag (matched case-insensitively).
	                consumed = new RegExp('^[\\s\\S]*?</' + tag + '[^>]*>', 'i').exec(str);
	                if (!consumed) {
	                    broken = ERROR('MissingEndTag', 'Line {0}: {1} start tag missing corresponding end tag.', lineNumber, '<' + tag + '>');
	                    break;
	                }
	                line += countNewlines(consumed[0]);
	                str = str.substring(consumed[0].length);
	                continue;
	            }

	            if (!isCloseTag) {
	                stack.push({
	                    tag: tag,
	                    line: lineNumber
	                });
	                continue;
	            }

	            if (stack.length === 0) {
	                broken = ERROR('ExtraTag', 'Line {0}: Extra end tag found: {1}', lineNumber, '<' + rawTag + '>');
	                break;
	            }
	            last = stack[stack.length - 1];
	            if (last.tag !== tag) {
	                //If some outer tag with this name is open, an inner tag was left unclosed;
	                //otherwise the end tag has no start tag at all.
	                pos = -1;
	                for (i = 0; i < stack.length; i += 1) {
	                    if (stack[i].tag === tag) {
	                        pos = i;
	                        break;
	                    }
	                }
	                if (pos < 0) {
	                    broken = ERROR('ExtraTag', 'Line {0}: Extra end tag found: {1}', lineNumber, '<' + rawTag + '>');
	                } else {
	                    broken = ERROR('WrongTag', 'Line {0}: {1} start tag from line {2} should be closed before {3}.', lineNumber,
	                        '<' + last.tag + '>', last.line, '<' + rawTag + '>');
	                }
	                break;
	            }
	            stack.pop();
	        }
	        if (!broken && stack.length > 0) {
	            last = stack[stack.length - 1];
	            broken = ERROR('MissingEndTag', 'Line {0}: {1} start tag missing corresponding end tag.', last.line, '<' + last.tag + '>');
	        }
	        return broken ? broken : true;
	    };
	}));
	<!DOCTYPE html>
	<html>
	<head>
	<!--
	Test this a multi line comment. >>
	-->
	<meta title="unclosed meta is ok. unclosed div isn't ok."></meta>
	<script>
	var script = '<script>';
	</script>
	<![CDATA[This is a cdata
	section]]>
	</head>
	<body>
	<input type=text data-extra="text"></input>
	<
	div>
	<div>
	<div></div >
	</div>
	<x:blah></x:blah>
	<x-custom></x-custom>
	</body>
	</html>
	</html>