Skip to content

Instantly share code, notes, and snippets.

@Munawwar
Last active April 18, 2022 07:38
Show Gist options
  • Save Munawwar/9353965 to your computer and use it in GitHub Desktop.
Save Munawwar/9353965 to your computer and use it in GitHub Desktop.
Unbalanced HTML markup detection
/**
* Detect unsafe (and potentially unsafe) unbalanced tags in a given HTML snippet.
* Hints taken from an HTML parser (https://gist.github.com/cburgmer/2877758).
*
* Example:
* An unclosed div tag is considered unsafe, because if the snippet is pasted in between two div tags
* then it could end up breaking the HTML document.
* Tags with optional end tags (tags that you can intentionally leave open, like <table><tr><td>some text</table>) are also considered unsafe, for the same reason.
* However an unclosed void tag (like meta tag) is safe, because browsers will ignore it without any side effects.
*
* Usage: ValidateHtml('<html string>');
*/
(function (root, factory) {
    //UMD wrapper: register with an AMD loader, export for CommonJS, or attach to the global object.
    if (typeof define === "function" && define.amd) {
        define(factory);
    } else if (typeof exports === 'object') { //For NodeJS
        module.exports = factory();
    } else { //For browsers
        root.ValidateHtml = factory();
    }
    //`this` is undefined in ES modules / strict contexts, so prefer an explicit global object when one exists.
}(typeof globalThis !== 'undefined' ? globalThis : (typeof self !== 'undefined' ? self : this), function () {
    /**
     * Convert a comma separated list of names into a lookup table.
     * The table has no prototype, so tag names such as "constructor" or "__proto__"
     * can never be mistaken for list members.
     * @param {string} str Comma separated names. Empty entries are ignored.
     * @returns {Object} Map of name -> true.
     */
    function unwrap(str) {
        var names = str.split(','),
            lookup = Object.create(null),
            i;
        for (i = 0; i < names.length; i += 1) {
            if (names[i]) {
                lookup[names[i]] = true;
            }
        }
        return lookup;
    }

    /**
     * Count the line feed characters in a piece of text.
     * @param {string} text
     * @returns {number}
     */
    function countNewlines(text) {
        var count = 0,
            idx = text.indexOf('\n');
        while (idx !== -1) {
            count += 1;
            idx = text.indexOf('\n', idx + 1);
        }
        return count;
    }

    /**
     * Build an error result.
     * Placeholders of the form {0}, {1}, ... in msg are replaced with the extra arguments
     * (missing arguments become an empty string). A placeholder preceded by a backslash is left untouched.
     * @param {string} status Machine readable error code.
     * @param {string} msg Message template.
     * @returns {{status: string, message: string}}
     */
    function ERROR(status, msg) {
        var args = Array.prototype.slice.call(arguments, 2);
        return {
            status: status,
            //The optional backslash is matched (not the preceding character) so that
            //adjacent placeholders like "{0}{1}" are both substituted.
            message: msg.replace(/\\?\{(\w+)\}/g, function (match, index) {
                var value;
                if (match.charAt(0) === '\\') {
                    return match;
                }
                value = args[index];
                return value !== undefined ? String(value) : '';
            })
        };
    }

    //HTML 4 and 5 void tags
    var voidTags = unwrap('area,base,basefont,br,col,command,embed,frame,hr,img,input,keygen,link,meta,param,source,track,wbr'),
        //Tags whose content is raw text: everything up to the matching end tag is skipped.
        singlelevel = unwrap('script,style'),
        regxstr = {
            tagname: "[\\-A-Za-z0-9_:]+",
            attrname: "[\\w\\-]+",
            attrvalue: (/(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+)/).source //quoted and unquoted strings
        },
        //Note: [\s\S] is used instead of (?:.|\n) because "." does not match U+2028/U+2029,
        //which would silently truncate the remaining text.
        regx = {
            //Start tag: leading text, "<", tag name, attributes, anything up to ">", then the remaining text.
            opentag: new RegExp('^[^<]*?<(' + regxstr.tagname + ')' +
                '(?:\\s+' + regxstr.attrname +
                    '(?:\\s*=\\s*' + regxstr.attrvalue + ')?' +
                ')*' +
                '([^>]*?)>([\\s\\S]*)'),
            othertag: /^[^<]*?<([!\-\[\]\/A-Za-z0-9_:]+)([^>]*?)>([\s\S]*)/, //close tags, doctype, comments, cdata
            comment: /^[^<]*?<!--[\s\S]*?-->/,
            cdata: /^[^<]*?<!\[CDATA\[[\s\S]*?\]\]>/
        };

    /**
     * Check an HTML snippet for unbalanced tags.
     * @param {string} html HTML snippet. null/undefined is treated as an empty snippet.
     * @returns {(boolean|{status: string, message: string})} true when the snippet is balanced,
     * otherwise an error object whose status is one of CDATANotClosed, CommentNotClosed,
     * MissingEndTag, ExtraTag or WrongTag. Line numbers in messages are 1-based and relative to the input.
     */
    return function (html) {
        var str = (html === null || html === undefined ? '' : String(html)).replace(/\r/g, ''),
            line = 1, //line number of the first character of str
            stack = [], //currently open tags, innermost last
            broken = '',
            matches, rawTag, tag, isCloseTag, consumed, lineNumber, special, endTagRegex, last, isOpen, pos;

        while (str) {
            matches = (str.match(regx.opentag) || str.match(regx.othertag));
            if (!matches) {
                //The next "<" does not start a tag. Skip over it, keeping the line count in sync.
                pos = str.indexOf('<');
                if (pos < 0) {
                    break;
                }
                line += countNewlines(str.substring(0, pos));
                str = str.substring(pos + 1);
                continue;
            }

            rawTag = matches[1];
            tag = rawTag.toLowerCase(); //html is case insensitive
            //Text consumed by this match: leading text plus the tag itself.
            consumed = str.substring(0, str.length - matches[3].length);
            //Line on which the tag's "<" sits.
            lineNumber = line + countNewlines(consumed.substring(0, consumed.indexOf('<')));

            //Identify close tag
            isCloseTag = (tag.charAt(0) === '/');
            if (isCloseTag) {
                tag = tag.substr(1);
            }

            if (tag.charAt(0) === '!') {
                //Doctype, comment or CDATA. Comments and CDATA may contain ">", so they are
                //re-matched against the whole remaining text up to their real terminator.
                special = null;
                if (tag.indexOf('![cdata[') === 0) {
                    special = regx.cdata.exec(str);
                    if (!special) {
                        broken = ERROR('CDATANotClosed', 'Line {0}: CDATA section not closed properly.', lineNumber);
                        break;
                    }
                } else if (tag.indexOf('!--') === 0) {
                    special = regx.comment.exec(str);
                    if (!special) {
                        broken = ERROR('CommentNotClosed', 'Line {0}: HTML comment not closed properly.', lineNumber);
                        break;
                    }
                }
                if (special) {
                    consumed = special[0];
                }
                line += countNewlines(consumed);
                str = str.substring(consumed.length);
                continue;
            }

            //Advance past the tag for every remaining case, so that line numbers stay correct
            //even after tags that are otherwise ignored.
            line += countNewlines(consumed);
            str = matches[3];

            if (voidTags[tag]) {
                //Void tags (and their stray end tags) never need balancing.
                continue;
            }

            if (singlelevel[tag] && !isCloseTag) {
                //Remove everything up to and including the end tag. The match is case insensitive
                //because the end tag may be written in any case (e.g. </SCRIPT>).
                endTagRegex = new RegExp('^([\\s\\S]*?)</' + tag + '[^>]*>', 'i');
                special = endTagRegex.exec(str);
                if (!special) {
                    broken = ERROR('MissingEndTag', 'Line {0}: {1} start tag missing corresponding end tag.', lineNumber, '<' + tag + '>');
                    break;
                }
                line += countNewlines(special[0]);
                str = str.substring(special[0].length);
                continue;
            }

            if (!isCloseTag) {
                stack.push({
                    tag: tag,
                    line: lineNumber
                });
                continue;
            }

            //End tag: it must close the innermost open tag.
            if (stack.length === 0) {
                broken = ERROR('ExtraTag', 'Line {0}: Extra end tag found: {1}', lineNumber, '<' + rawTag + '>');
                break;
            }
            last = stack[stack.length - 1];
            if (last.tag !== tag) {
                isOpen = stack.some(function (o) {
                    return o.tag === tag;
                });
                if (!isOpen) {
                    broken = ERROR('ExtraTag', 'Line {0}: Extra end tag found: {1}', lineNumber, '<' + rawTag + '>');
                } else {
                    broken = ERROR('WrongTag', 'Line {0}: {1} start tag from line {2} should be closed before {3}.', lineNumber,
                        '<' + last.tag + '>', last.line, '<' + rawTag + '>');
                }
                break;
            }
            stack.pop();
        }

        if (!broken && stack.length > 0) {
            last = stack[stack.length - 1];
            broken = ERROR('MissingEndTag', 'Line {0}: {1} start tag missing corresponding end tag.', last.line, '<' + last.tag + '>');
        }
        return broken || true;
    };
}));
<!DOCTYPE html>
<html>
<head>
<!--
Test this a multi line comment. >>
-->
<meta title="unclosed meta is ok. unclosed div isn't ok."></meta>
<script>
var script = '<script>';
</script>
<![CDATA[This is a cdata
section]]>
</head>
<body>
<input type=text data-extra="text"></input>
<
div>
<div>
<div></div >
</div>
<x:blah></x:blah>
<x-custom></x-custom>
</body>
</html>
</html>
<!DOCTYPE html>
<html>
<head>
<script src="html-validator.js"></script>
</head>
<body>
<script>
//Fetch the test page and log the validation result.
var xhr = new XMLHttpRequest();
//Asynchronous request: synchronous XHR on the main thread is deprecated and blocks the page.
xhr.open('GET', 'test.html', true);
xhr.onload = function () {
    //status is 0 when the page is loaded from file://, so only treat real HTTP errors as failures.
    if (xhr.status >= 400) {
        console.error('Could not load test.html (HTTP ' + xhr.status + ')');
        return;
    }
    console.log(ValidateHtml(xhr.responseText));
};
xhr.onerror = function () {
    console.error('Could not load test.html');
};
xhr.send();
</script>
</body>
</html>
<!DOCTYPE HTML>
<html>
<head>
<script src="html-validator.js"></script>
</head>
<body>
<textarea id="input" rows="25" style="width:100%"></textarea>
<textarea id="output" readonly rows="5" style="width:100%"></textarea>
<button id="btn">Validate HTML</button>
<script type="text/javascript">
(function () {
    //Source textarea (user input) and read-only textarea that shows the result.
    var sourceBox = document.querySelector('#input');
    var resultBox = document.querySelector('#output');

    //Validate the current input and show the result as pretty-printed JSON.
    function showValidationResult() {
        var result = window.ValidateHtml(sourceBox.value);
        resultBox.value = JSON.stringify(result, null, 2);
    }

    document.querySelector('#btn').onclick = showValidationResult;
}());
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment