Skip to content

Instantly share code, notes, and snippets.

@Munawwar
Last active June 6, 2023 10:24
Show Gist options
  • Save Munawwar/8c46433e89f08850944d547acef367ca to your computer and use it in GitHub Desktop.
Save Munawwar/8c46433e89f08850944d547acef367ca to your computer and use it in GitHub Desktop.
Tiny HTML5 SAX Parser for browser
/*
* The smallest html sax parser - 0.5kb gzipped
*
* Usage: see the JSDoc on the default export (parseHtml) below.
*/
// Tokenizer for everything outside raw-text elements. The alternatives are, in
// order: start tag, end tag, comment, CDATA section. Capture groups:
//   1 = start-tag name, 2 = "/" when the start tag is written self-closed,
//   3 = end-tag name,   4 = comment body,  5 = CDATA body.
// Comment and CDATA bodies use [\s\S]*? so they may be empty, span several
// lines and contain ">" characters.
const startTagRegex = /(?:<([a-zA-Z][^\s\/>]*)(?:\s+[^\s\/>"'=]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*\s*(\/?)\s*>)|(?:<\/\s*([a-zA-Z][^\s\/>]*)\s*>)|(?:<!--([\s\S]*?)-->)|(?:<!\[CDATA\[([\s\S]*?)\]\]>)/ig;

// Void tags (HTML 5): they never have content or an end tag.
const voidTags = new Set('area,base,br,col,embed,hr,img,input,keygen,link,meta,param,source,track,wbr'.split(','));

// Tags whose content is reported verbatim as a single `content` event instead
// of being tokenized (HTML 5 raw-text / escapable raw-text elements, plus
// <template>, which this parser deliberately treats the same way).
const rawTextTags = new Set('script,style,textarea,title,template'.split(','));

// One end-tag matcher per raw-text tag, so that e.g. a "</title>" inside a
// <textarea> does not terminate the textarea. The lookahead rejects longer
// names such as "</scripts>".
const rawTextEndTagRegexes = new Map(
  [...rawTextTags].map((tag) => [tag, new RegExp(`</${tag}(?=[\\s/>])[^>]*>`, 'gi')]),
);

/**
 * Tokenizes an HTML string and reports each token, in document order, through
 * `callback` (SAX style). No tree is built and nothing is returned.
 *
 * @param {string} html Assumes balanced, valid HTML as input
 * @param {(
 *   type: 'tag'|'tagEnd'|'text'|'comment'|'cdata'|'content',
 *   matchedSubstring: string,
 *   contextualInfo: string,
 *   selfClosed: boolean,
 * ) => undefined} callback contextualInfo is based on the `type`
 *   if type is `tag` or `tagEnd`, it is the tag name (case as written in the input)
 *   if type is `text`, `comment` or `cdata`, it is the nodeValue
 *   if type is `content`, it is the textContent inside the
 *   script/style/textarea/title/template tag
 *
 *   selfClosed is only passed for type = 'tag': true for void tags and for tags
 *   written as `<x/>`, always false for script/style/textarea/title/template.
 *
 * Notes:
 * - A raw-text element ends at the first end tag with the same name; nested
 *   <template> elements are therefore not supported.
 * - If a raw-text element has no end tag, the rest of the input is reported as
 *   its `content` and no `tagEnd` event is emitted.
 * @throws {Error} 'Parser error' if the tokenizer fails to make progress.
 */
export default function parseHtml(html, callback) {
  // Index of the first character that has not been reported yet.
  let position = 0;

  for (;;) {
    // startTagRegex is shared, stateful (/g) module state. Seeding lastIndex on
    // every iteration keeps the parse correct even if `callback` itself calls
    // parseHtml (which would otherwise clobber lastIndex).
    startTagRegex.lastIndex = position;
    const match = startTagRegex.exec(html);
    if (!match) break;

    const [matchedSubString, tagStartName, unary, tagEndName, comment, cdata] = match;
    // Defensive: an empty match would never advance `position`.
    if (matchedSubString.length === 0) {
      throw new Error('Parser error');
    }

    // Anything between the previous token and this one is plain text.
    if (match.index > position) {
      const text = html.slice(position, match.index);
      callback('text', text, text);
    }
    position = match.index + matchedSubString.length;

    if (tagStartName) {
      const tagLowercase = tagStartName.toLowerCase();
      const endTagRegex = rawTextEndTagRegexes.get(tagLowercase);

      if (!endTagRegex) {
        callback('tag', matchedSubString, tagStartName, !!unary || voidTags.has(tagLowercase));
        continue;
      }

      // Raw-text element: skip straight to its own end tag without tokenizing
      // what lies in between.
      callback('tag', matchedSubString, tagStartName, false);
      endTagRegex.lastIndex = position;
      const endMatch = endTagRegex.exec(html);

      if (!endMatch) {
        // Unterminated element: everything that is left is its content.
        const rest = html.slice(position);
        callback('content', rest, rest);
        position = html.length;
        break;
      }

      const content = html.slice(position, endMatch.index);
      callback('content', content, content);
      callback('tagEnd', endMatch[0], tagStartName);
      position = endMatch.index + endMatch[0].length;
    } else if (tagEndName) {
      callback('tagEnd', matchedSubString, tagEndName);
    } else if (comment !== undefined) {
      // Compared against undefined because an empty comment body is valid.
      callback('comment', matchedSubString, comment);
    } else if (cdata !== undefined) {
      callback('cdata', matchedSubString, cdata);
    }
  }

  // Trailing text after the last token.
  if (position < html.length) {
    const text = html.slice(position);
    callback('text', text, text);
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment