Skip to content

Instantly share code, notes, and snippets.

@sstur
Last active February 9, 2017 19:09
Show Gist options
  • Save sstur/6363525 to your computer and use it in GitHub Desktop.
Save sstur/6363525 to your computer and use it in GitHub Desktop.
Pure JS HTML Parser (ported from CKEditor 4.2)
/*!
* HTML Parser
* Ported from CKEditor 4.2 (f74e558351)
*
*/
/*global require, exports, module, define */
var HTMLParser;
(function(definition) {
if (typeof exports == 'object' && typeof module == 'object') {
// CommonJS/Node
return definition(require, exports, module);
}
if (typeof define == 'function') {
//AMD or Other
return define.amd ? define(['exports'], definition) : define('html-parser', definition);
}
var module = {exports: {}};
definition(function() {}, module.exports, module);
return (HTMLParser = module.exports);
})(function(require, exports, module) {
"use strict";
/**
 * Small utility belt used throughout the parser (class factory, binding,
 * deep cloning, attribute encoding, object merging, trimming, searching).
 */
var tools = {
  /**
   * Builds a constructor from a definition object.
   *
   * @param {Object} definition
   *   `$`        - the constructor function.
   *   `privates` - (or `_`) members copied into `this._` on construction;
   *                functions are bound to the instance.
   *   `proto`    - members copied onto the prototype (overwriting).
   *   `statics`  - members copied onto the constructor itself (overwriting).
   * @returns {Function} The resulting constructor.
   */
  createClass: function(definition) {
    var $ = definition.$;
    var privateMembers = definition.privates || definition._;
    var protoMembers = definition.proto;
    var staticMembers = definition.statics;

    if (privateMembers) {
      var wrappedConstructor = $;
      $ = function() {
        var instance = this;
        // Create the private namespace on first use.
        if (!instance._) {
          instance._ = {};
        }
        var namespace = instance._;
        for (var key in privateMembers) {
          var member = privateMembers[key];
          // Private functions always run with the instance as "this".
          if (typeof member == 'function') {
            namespace[key] = tools.bind(member, instance);
          } else {
            namespace[key] = member;
          }
        }
        wrappedConstructor.apply(instance, arguments);
      };
    }

    if (protoMembers) {
      this.extend($.prototype, protoMembers, true);
    }
    if (staticMembers) {
      this.extend($, staticMembers, true);
    }
    return $;
  },

  /**
   * Returns a function that always calls `func` with `obj` as `this`,
   * forwarding whatever arguments it receives.
   */
  bind: function(func, obj) {
    return function() {
      return func.apply(obj, arguments);
    };
  },

  /**
   * Deep-copies arrays and plain objects. Primitives, `null`, and
   * String/Number/Boolean/Date/RegExp instances are returned as-is.
   */
  clone: function(obj) {
    var copy;

    // Arrays are rebuilt element by element.
    if (obj && obj instanceof Array) {
      copy = [];
      var position = 0;
      while (position < obj.length) {
        copy[position] = tools.clone(obj[position]);
        position++;
      }
      return copy;
    }

    // Anything that is not a (non-null) object is returned untouched.
    if (obj === null || typeof obj != 'object') {
      return obj;
    }

    // Wrapper and value-like objects are shared rather than copied.
    if (obj instanceof String || obj instanceof Number || obj instanceof Boolean ||
        obj instanceof Date || obj instanceof RegExp) {
      return obj;
    }

    // Generic object: new instance of the same constructor, members cloned.
    copy = new obj.constructor();
    for (var key in obj) {
      copy[key] = tools.clone(obj[key]);
    }
    return copy;
  },

  /**
   * Escapes `"`, `<` and `>` so the text can sit inside a double-quoted
   * attribute value. Ampersands are left untouched.
   */
  htmlEncodeAttr: function(text) {
    var entities = {'"': '&quot;', '<': '&lt;', '>': '&gt;'};
    return text.replace(/["<>]/g, function(character) {
      return entities[character];
    });
  },

  /**
   * Copies properties from one or more source objects onto `target`.
   *
   * Call shapes:
   *   extend(target, src...)                  - fill in missing properties only.
   *   extend(target, src..., overwrite)       - `true` replaces existing ones.
   *   extend(target, src..., overwrite, list) - additionally restricts the
   *                                             copy to keys present in `list`.
   * @returns {Object} `target`.
   */
  extend: function(target) {
    var sourceEnd = arguments.length;
    var lastArg = arguments[sourceEnd - 1];
    var secondToLastArg = arguments[sourceEnd - 2];
    var forceOverwrite = false;
    var allowedKeys;

    if (typeof lastArg == 'boolean') {
      forceOverwrite = lastArg === true;
      sourceEnd -= 1;
    } else if (typeof secondToLastArg == 'boolean') {
      forceOverwrite = secondToLastArg === true;
      allowedKeys = lastArg;
      sourceEnd -= 2;
    }

    for (var argIndex = 1; argIndex < sourceEnd; argIndex++) {
      var source = arguments[argIndex];
      for (var key in source) {
        // Existing fields are only replaced in overwrite mode.
        if (!forceOverwrite && target[key] != undefined) {
          continue;
        }
        // When a key list is given, only listed keys are copied.
        if (allowedKeys && !(key in allowedKeys)) {
          continue;
        }
        target[key] = source[key];
      }
    }
    return target;
  },

  /**
   * Strips leading and trailing spaces, tabs, and line breaks.
   * `\s` is deliberately not used so non-breaking spaces survive.
   */
  trim: function(str) {
    return str.replace(/^[ \t\n\r]*|[ \t\n\r]*$/g, '');
  },

  /**
   * Finds the index of `value` in `array`, or of the first item for which
   * `value` returns a truthy result when `value` is a function.
   * Returns -1 when nothing matches.
   */
  indexOf: function(array, value) {
    var isPredicate = typeof value == 'function';

    // Plain value lookups use the native method when the object has one.
    if (!isPredicate && array.indexOf) {
      return array.indexOf(value);
    }

    for (var position = 0, total = array.length; position < total; position++) {
      var matched = isPredicate ? value(array[position]) : array[position] === value;
      if (matched) {
        return position;
      }
    }
    return -1;
  }
};
// Numeric node-type tags stored in the `type` property of the node classes
// below (e.g. Parser.CData uses NODE_TEXT, Parser.Comment uses NODE_COMMENT).
// The values mirror the DOM `Node.nodeType` constants.
var NODE_ELEMENT = 1;
var NODE_TEXT = 3;
var NODE_PROCESSING_INSTRUCTION = 7;
var NODE_COMMENT = 8;
var NODE_DOCUMENT_TYPE = 10;
var NODE_DOCUMENT_FRAGMENT = 11;
// Element containment table ("DTD").
// Each lower-case tag name maps to a set object whose keys are the child tag
// names allowed inside it; the key '#' stands for text (see set T below).
// Keys starting with '$' are named groups of elements (blocks, empty elements,
// CDATA containers, ...) that the rest of the parser queries by tag name.
var dtd = (function() {
// X merges sets: tools.extend called without a trailing boolean only fills in
// keys that are not yet defined on the first argument, which it returns.
var X = tools.extend;
// Set subtraction: returns a clone of `source` with every key found in any of
// the following arguments removed. `source` itself is left untouched.
var Y = function(source, removed) {
var substracted = tools.clone(source);
for (var i = 1; i < arguments.length; i++) {
removed = arguments[i];
for(var name in removed)
delete substracted[name];
}
return substracted;
};
// P (phrasing) and F (flow) start empty and are filled in further down.
var P = {}, F = {};
// Intersection of flow elements set and phrasing elements set.
var PF = {a: 1, abbr: 1, area: 1, audio: 1, b: 1, bdi: 1, bdo: 1, br: 1, button: 1, canvas: 1, cite: 1, code: 1, command: 1, datalist: 1, del: 1, dfn: 1, em: 1, embed: 1, i: 1, iframe: 1, img: 1, input: 1, ins: 1, kbd: 1, keygen: 1, label: 1, map: 1, mark: 1, meter: 1, noscript: 1, object: 1, output: 1, progress: 1, q: 1, ruby: 1, s: 1, samp: 1, script: 1, select: 1, small: 1, span: 1, strong: 1, sub: 1, sup: 1, textarea: 1, time: 1, u: 1, 'var': 1, video: 1, wbr: 1};
// F - PF (Flow Only).
var FO = {address: 1, article: 1, aside: 1, blockquote: 1, details: 1, div: 1, dl: 1, fieldset: 1, figure: 1, footer: 1, form: 1, h1: 1, h2: 1, h3: 1, h4: 1, h5: 1, h6: 1, header: 1, hgroup: 1, hr: 1, menu: 1, nav: 1, ol: 1, p: 1, pre: 1, section: 1, table: 1, ul: 1};
// Metadata elements.
var M = {command: 1, link: 1, meta: 1, noscript: 1, script: 1, style: 1};
// Empty.
var E = {};
// Text.
var T = {'#': 1};
// Deprecated phrasing elements.
var DP = {acronym: 1, applet: 1, basefont: 1, big: 1, font: 1, isindex: 1, strike: 1, style: 1, tt: 1}; // TODO remove "style".
// Deprecated flow only elements.
var DFO = {center: 1, dir: 1, noframes: 1};
// Phrasing elements := PF + T + DP
X(P, PF, T, DP);
// Flow elements := FO + P + DFO
X(F, FO, P, DFO);
// The table itself. This local `dtd` shadows the outer variable being
// initialised; it is what the enclosing function returns.
// Note that P, F, E and T are shared by reference between many entries.
var dtd = {
a: Y(P, {a: 1, button: 1}), // Treat as normal inline element (not a transparent one).
abbr: P,
address: F,
area: E,
article: X({style: 1}, F),
aside: X({style: 1}, F),
audio: X({source: 1, track: 1}, F),
b: P,
base: E,
bdi: P,
bdo: P,
blockquote: F,
body: F,
br: E,
button: Y(P, {a: 1 ,button: 1}),
canvas: P, // Treat as normal inline element (not a transparent one).
caption: F,
cite: P,
code: P,
col: E,
colgroup: {col: 1},
command: E,
datalist: X({option: 1}, P),
dd: F,
del: P, // Treat as normal inline element (not a transparent one).
details: X({summary: 1}, F),
dfn: P,
div: X({style: 1}, F),
dl: {dt: 1, dd: 1},
dt: F,
em: P,
embed: E,
fieldset: X({legend: 1}, F),
figcaption: F,
figure: X({figcaption: 1}, F),
footer: F,
form: F,
h1: P,
h2: P,
h3: P,
h4: P,
h5: P,
h6: P,
head: X({title: 1, base: 1}, M),
header: F,
hgroup: {h1: 1, h2: 1, h3: 1, h4: 1, h5: 1, h6: 1},
hr: E,
html: X({head: 1, body: 1}, F, M), // Head and body are optional...
i: P,
iframe: T,
img: E,
input: E,
ins: P, // Treat as normal inline element (not a transparent one).
kbd: P,
keygen: E,
label: P,
legend: P,
li: F,
link: E,
map: F,
mark: P, // Treat as normal inline element (not a transparent one).
menu: X({li: 1}, F),
meta: E,
meter: Y(P, {meter: 1}),
nav: F,
noscript: X({link: 1, meta: 1, style: 1}, P), // Treat as normal inline element (not a transparent one).
object: X({param: 1}, P), // Treat as normal inline element (not a transparent one).
ol: {li: 1},
optgroup: {option: 1},
option: T,
output: P,
p: P,
param: E,
pre: P,
progress: Y(P, {progress: 1}),
q: P,
rp: P,
rt: P,
ruby: X({rp: 1, rt: 1}, P),
s: P,
samp: P,
script: T,
section: X({style: 1}, F),
select: {optgroup: 1, option: 1},
small: P,
source: E,
span: P,
strong: P,
style: T,
sub: P,
summary: P,
sup: P,
table: {caption: 1, colgroup: 1, thead: 1, tfoot: 1, tbody: 1, tr: 1},
tbody: {tr: 1},
td: F,
textarea: T,
tfoot: {tr: 1},
th: F,
thead: {tr: 1},
time: Y(P, {time: 1}),
title: T,
tr: {th: 1, td: 1},
track: E,
u: P,
ul: {li: 1},
'var': P,
video: X({source: 1, track: 1}, F),
wbr: E,
// Deprecated tags.
acronym: P,
applet: X({param: 1}, F),
basefont: E,
big: P,
center: F,
dialog: E,
dir: {li: 1},
font: P,
isindex: E,
noframes: F,
strike: P,
tt: P
};
// Add the '$'-prefixed element groups. This happens in a second step because
// $nonBodyContent reads `dtd.head`, which only exists once the table is built.
X(dtd, {
/**
* List of block elements, like `<p>` or `<div>`.
*/
$block: X({audio: 1, dd: 1, dt: 1, li: 1, video: 1}, FO, DFO),
/**
* List of elements that contains other blocks, in which block-level operations should be limited,
* this property is not intended to be checked directly, use {@link dom.elementPath#blockLimit} instead.
*
* Some examples of editor behaviors that are impacted by block limits:
*
* * Enter key never split a block-limit element;
* * Style application is constraint by the block limit of the current selection.
* * Pasted html will be inserted into the block limit of the current selection.
*
* **Note:** As an exception `<li>` is not considered as a block limit, as it's generally used as a text block.
*/
$blockLimit: {article: 1, aside: 1, audio: 1, body: 1, caption: 1, details: 1, dir: 1, div: 1, dl: 1, fieldset: 1, figure: 1, footer: 1, form: 1, header: 1, hgroup: 1, menu: 1, nav: 1, ol: 1, section: 1, table: 1, td: 1, th: 1, tr: 1, ul: 1, video: 1},
/**
* List of elements that contain character data.
*/
$cdata: {script: 1, style: 1},
/**
* List of elements that are accepted as inline editing hosts.
*/
$editable: {address: 1, article: 1, aside: 1, blockquote: 1, body: 1, details: 1, div: 1, fieldset: 1, footer: 1, form: 1, h1: 1, h2: 1, h3: 1, h4: 1, h5: 1, h6: 1, header: 1, hgroup: 1, nav: 1, p: 1, pre: 1, section: 1},
/**
* List of empty (self-closing) elements, like `<br>` or `<img>`.
*/
$empty: {area: 1, base: 1, basefont: 1, br: 1, col: 1, command: 1, dialog: 1, embed: 1, hr: 1, img: 1, input: 1, isindex: 1, keygen: 1, link: 1, meta: 1, param: 1, source: 1, track: 1, wbr: 1},
/**
* List of inline (`<span>` like) elements.
*/
$inline: P,
/**
* List of list root elements.
*/
$list: {dl: 1, ol: 1, ul: 1},
/**
* List of list item elements, like `<li>` or `<dd>`.
*/
$listItem: {dd: 1, dt: 1, li: 1},
/**
* List of elements which may live outside body.
*/
$nonBodyContent: X({body: 1, head: 1, html: 1}, dtd.head),
/**
* Elements that accept text nodes, but are not possible to edit into the browser.
*/
$nonEditable: {applet: 1, audio: 1, button: 1, embed: 1, iframe: 1, map: 1, object: 1, option: 1, param: 1, script: 1, textarea: 1, video: 1},
/**
* Elements that are considered objects, therefore selected as a whole in the editor.
*/
$object: {applet: 1, audio: 1, button: 1, hr: 1, iframe: 1, img: 1, input: 1, object: 1, select: 1, table: 1, textarea: 1, video: 1},
/**
* List of elements that can be ignored if empty, like `<b>` or `<span>`.
*/
$removeEmpty: {abbr: 1, acronym: 1, b: 1, bdi: 1, bdo: 1, big: 1, cite: 1, code: 1, del: 1, dfn: 1, em: 1, font: 1, i: 1, ins: 1, label: 1, kbd: 1, mark: 1, meter: 1, output: 1, q: 1, ruby: 1, s: 1, samp: 1, small: 1, span: 1, strike: 1, strong: 1, sub: 1, sup: 1, time: 1, tt: 1, u: 1, 'var': 1},
/**
* List of elements that have tabindex set to zero by default.
*/
$tabIndex: {a: 1, area: 1, button: 1, input: 1, object: 1, select: 1, textarea: 1},
/**
* List of elements used inside the `<table>` element, like `<tbody>` or `<td>`.
*/
$tableContent: {caption: 1, col: 1, colgroup: 1, tbody: 1, td: 1, tfoot: 1, th: 1, thead: 1, tr: 1},
/**
* List of "transparent" elements. See [W3C's definition of "transparent" element](http://dev.w3.org/html5/markup/terminology.html#transparent).
*/
$transparent: {a: 1, audio: 1, canvas: 1, del: 1, ins: 1, map: 1, noscript: 1, object: 1, video: 1},
/**
* List of elements that are not to exist standalone that must live under it's parent element.
*/
$intermediate: {caption: 1, colgroup: 1, dd: 1, dt: 1, figcaption: 1, legend: 1, li: 1, optgroup: 1, option: 1, rp: 1, rt: 1, summary: 1, tbody: 1, td: 1, tfoot: 1, th: 1, thead: 1, tr: 1}
});
return dtd;
})();
/**
 * Event-style tokenizer for strings of HTML. Assign handlers to the `on*`
 * properties and call `parse()`.
 *
 *     var parser = new Parser();
 *     parser.onTagOpen = function(tagName, attributes, selfClosing) {
 *         alert(tagName);
 *     };
 *     parser.parse('<p>Some <b>text</b>.</p>'); // Alerts 'p', 'b'.
 *
 * @class
 * @constructor Creates a Parser class instance.
 * @param {Object} [opts] Options object; stored as `this._.opts`
 * (an empty object is used when omitted).
 */
var Parser = module.exports = function Parser(opts) {
  // One alternative per token kind: closing tag, comment, opening tag
  // (tag name followed by its raw attribute text).
  var tokenPattern = '<(?:(?:\\/([^>]+)>)|(?:!--([\\S|\\s]*?)-->)|(?:([^\\s>]+)\\s*((?:(?:"[^"]*")|(?:\'[^\']*\')|[^"\'>])*)\\/?>))';
  var internals = {};
  internals.opts = opts || {};
  internals.htmlPartsRegex = new RegExp(tokenPattern, 'g');
  this._ = internals;
};
(function() {
  // Matches one attribute inside the raw attribute text of an opening tag:
  // name, then optionally = "double quoted" | 'single quoted' | unquoted.
  var attribsRegex = /([\w\-:.]+)(?:(?:\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)')|([^\s>]+)))|(?=\s|$))/g;
  // Boolean attributes: when written without a value they get their own name as value.
  var emptyAttribs = {checked: 1, compact: 1, declare: 1, defer: 1, disabled: 1, ismap: 1, multiple: 1, nohref: 1, noresize: 1, noshade: 1, nowrap: 1, readonly: 1, selected: 1};
  // Tag and attribute names come straight from the parsed HTML, so lookups in
  // the sets must not see inherited keys such as "constructor" or "toString".
  var hasOwn = Object.prototype.hasOwnProperty;

  Parser.prototype = {
    /**
     * Function to be fired when a tag opener is found. This function
     * should be overridden when using this class.
     *
     *     var parser = new Parser();
     *     parser.onTagOpen = function(tagName, attributes, selfClosing) {
     *         alert(tagName); // e.g. 'b'
     *     };
     *     parser.parse('<!-- Example --><b>Hello</b>');
     *
     * @param {String} tagName The tag name. The name is guaranteed to be lowercased.
     * @param {Object} attributes An object containing all tag attributes. Each
     * property in this object represents an attribute name and its value is the attribute value.
     * @param {Boolean} selfClosing `true` if the tag closes itself, false if the tag doesn't.
     */
    onTagOpen: function() {},

    /**
     * Function to be fired when a tag closer is found. This function
     * should be overridden when using this class.
     *
     *     var parser = new Parser();
     *     parser.onTagClose = function(tagName) {
     *         alert(tagName); // 'b'
     *     };
     *     parser.parse('<!-- Example --><b>Hello</b>');
     *
     * @param {String} tagName The tag name. The name is guaranteed to be lowercased.
     */
    onTagClose: function() {},

    /**
     * Function to be fired when text is found. This function
     * should be overridden when using this class.
     *
     *     var parser = new Parser();
     *     parser.onText = function(text) {
     *         alert(text); // 'Hello'
     *     };
     *     parser.parse('<!-- Example --><b>Hello</b>');
     *
     * @param {String} text The text found.
     */
    onText: function() {},

    /**
     * Function to be fired when a CDATA section is found. This function
     * should be overridden when using this class.
     *
     *     var parser = new Parser();
     *     parser.onCDATA = function(cdata) {
     *         alert(cdata); // 'var hello;'
     *     };
     *     parser.parse('<script>var hello;</script>');
     *
     * @param {String} cdata The CDATA that has been found.
     */
    onCDATA: function() {},

    /**
     * Function to be fired when a comment is found. This function
     * should be overridden when using this class.
     *
     *     var parser = new Parser();
     *     parser.onComment = function(comment) {
     *         alert(comment); // ' Example '
     *     };
     *     parser.parse('<!-- Example --><b>Hello</b>');
     *
     * @param {String} comment The comment text.
     */
    onComment: function() {},

    /**
     * Parses text, looking for HTML tokens, like tag openers or closers,
     * or comments. This function fires the onTagOpen, onTagClose, onText,
     * onCDATA and onComment functions during its execution. If the optional
     * `onDoctype` / `onXmlDecl` handlers are defined they receive the trimmed
     * content of a doctype / XML declaration; such tokens are otherwise skipped.
     *
     *     var parser = new Parser();
     *     // The onTagOpen, onTagClose, onText and onComment should be overridden
     *     // at this point.
     *     parser.parse('<!-- Example --><b>Hello</b>');
     *
     * @param {String} html The HTML to be parsed.
     */
    parse: function(html) {
      var partsRegex = this._.htmlPartsRegex;
      var parts, tagName;
      var nextIndex = 0;
      var cdata; // The collected data inside a CDATA section.

      for (;;) {
        // The regex is global and lives on the instance, so its lastIndex
        // survives between calls. Re-seed it on every iteration: a previous
        // parse() aborted by a throwing handler, or a handler that calls
        // parse() again, can then no longer make this scan start mid-string.
        partsRegex.lastIndex = nextIndex;
        parts = partsRegex.exec(html);
        if (!parts)
          break;

        var tagIndex = parts.index;
        // Capture the end of the match before any handler gets a chance to run.
        var matchEnd = partsRegex.lastIndex;

        if (tagIndex > nextIndex) {
          var text = html.substring(nextIndex, tagIndex);
          if (cdata) {
            cdata.push(text);
          } else {
            this.onText(text);
          }
        }
        nextIndex = matchEnd;

        /*
          "parts" is an array with the following items:
            0 : The entire match for opening/closing tags and comments.
            1 : Group filled with the tag name for closing tags.
            2 : Group filled with the comment text.
            3 : Group filled with the tag name for opening tags.
            4 : Group filled with the attributes part of opening tags.
        */
        if (!cdata) {
          // Doctype
          var firstTwoChars = parts[0].slice(0, 2);
          if (firstTwoChars == '<!' && parts[0].slice(2, 9).toLowerCase() == 'doctype') {
            if (this.onDoctype) this.onDoctype(tools.trim(parts[0].slice(9, -1)));
            continue;
          }
          // XML Declaration
          if (firstTwoChars == '<?' && parts[0].slice(2, 5).toLowerCase() == 'xml') {
            if (this.onXmlDecl) this.onXmlDecl(tools.trim(parts[0].slice(5, -2)));
            continue;
          }
        }

        // Closing tag
        if ((tagName = parts[1])) {
          tagName = tagName.toLowerCase();
          if (cdata && hasOwn.call(dtd.$cdata, tagName)) {
            // Send the CDATA data.
            this.onCDATA(cdata.join(''));
            cdata = null;
          }
          if (!cdata) {
            this.onTagClose(tagName);
            continue;
          }
        }

        // If CDATA is enabled, just save the raw match.
        if (cdata) {
          cdata.push(parts[0]);
          continue;
        }

        // Opening tag
        if ((tagName = parts[3])) {
          tagName = tagName.toLowerCase();
          // There are some tag names that can break things, so let's
          // simply ignore them when parsing. (#5224)
          if (/="/.test(tagName))
            continue;

          var attribs = {};
          var attribMatch;
          var attribsPart = parts[4];
          var selfClosing = !!(attribsPart && attribsPart.charAt(attribsPart.length - 1) == '/');

          // In "<br/>" (no space before the slash) the tag-name group swallows
          // the slash, producing the name "br/". Drop it, and flag the tag as
          // self-closing when the slash was the last thing before ">".
          if (tagName.length > 1 && tagName.charAt(tagName.length - 1) == '/') {
            tagName = tagName.slice(0, -1);
            if (!attribsPart)
              selfClosing = true;
          }

          if (attribsPart) {
            // Shared global regex: always start scanning from the beginning.
            attribsRegex.lastIndex = 0;
            while ((attribMatch = attribsRegex.exec(attribsPart))) {
              var attName = attribMatch[1].toLowerCase();
              var attValue = attribMatch[2] || attribMatch[3] || attribMatch[4] || '';
              if (!attValue && hasOwn.call(emptyAttribs, attName)) {
                attribs[attName] = attName;
              } else {
                attribs[attName] = attValue;
              }
            }
          }

          this.onTagOpen(tagName, attribs, selfClosing);

          // Open CDATA mode when finding the appropriate tags.
          if (!cdata && hasOwn.call(dtd.$cdata, tagName))
            cdata = [];
          continue;
        }

        // Comment
        if ((tagName = parts[2]))
          this.onComment(tagName);
      }

      // Whatever follows the last token is plain text.
      if (html.length > nextIndex)
        this.onText(html.substring(nextIndex, html.length));
    }
  };
})();
/**
 * Minimal HTML writer: collects string chunks in an internal buffer and
 * joins them on demand.
 */
Parser.BasicWriter = tools.createClass({
  /**
   * Creates a basicWriter class instance.
   *
   * @constructor
   * @param {Object} [opts] Writer options. `opts.xhtml` makes self-closing
   * tags end with ' />'.
   */
  $: function(opts) {
    var internals = {};
    internals.opts = opts || {};
    internals.output = [];
    this._ = internals;
  },

  proto: {
    /**
     * Starts an opening tag (everything up to, but excluding, the attributes).
     *
     *     // Writes '<p'.
     *     writer.openTag('p', {class : 'MyClass', id : 'MyId'});
     *
     * @param {String} tagName The element name for this tag.
     * @param {Object} attributes The attributes defined for this tag.
     * Not written by this method.
     */
    openTag: function(tagName, attributes) {
      var buffer = this._.output;
      buffer.push('<', tagName);
    },

    /**
     * Finishes an opening tag.
     *
     *     // Writes '>'.
     *     writer.openTagClose('p', false);
     *
     *     // Writes ' />' when the writer was created with `xhtml: true`.
     *     writer.openTagClose('br', true);
     *
     * @param {String} tagName The element name for this tag.
     * @param {Boolean} isSelfClose Indicates that this is a self-closing tag,
     * like `<br>` or `<img>`.
     */
    openTagClose: function(tagName, isSelfClose) {
      var internals = this._;
      var ending;
      if (internals.opts.xhtml && isSelfClose) {
        ending = ' />';
      } else {
        ending = '>';
      }
      internals.output.push(ending);
    },

    /**
     * Writes one attribute. Meant to be called between {@link #openTag}
     * and {@link #openTagClose}.
     *
     *     // Writes ' class="MyClass"'.
     *     writer.attribute('class', 'MyClass');
     *
     * @param {String} attName The attribute name.
     * @param {String} attValue The attribute value.
     */
    attribute: function(attName, attValue) {
      var written = attValue;
      // Browsers don't always escape special characters in attribute values. (#4683, #4719).
      if (typeof written == 'string') {
        written = tools.htmlEncodeAttr(written);
      }
      this._.output.push(' ', attName, '="', written, '"');
    },

    /**
     * Writes a closing tag.
     *
     *     // Writes '</p>'.
     *     writer.closeTag('p');
     *
     * @param {String} tagName The element name for this tag.
     */
    closeTag: function(tagName) {
      var buffer = this._.output;
      buffer.push('</', tagName, '>');
    },

    /**
     * Writes text, unmodified.
     *
     *     // Writes 'Hello Word'.
     *     writer.text('Hello Word');
     *
     * @param {String} text The text value.
     */
    text: function(text) {
      var buffer = this._.output;
      buffer.push(text);
    },

    /**
     * Writes a comment.
     *
     *     // Writes '<!-- My comment -->'.
     *     writer.comment(' My comment ');
     *
     * @param {String} comment The comment text.
     */
    comment: function(comment) {
      var buffer = this._.output;
      buffer.push('<!--', comment, '-->');
    },

    /**
     * Writes a doctype.
     *
     *     // Writes '<!doctype html>'.
     *     writer.doctype('html');
     *
     * @param {String} value The doctype text.
     */
    doctype: function(value) {
      var buffer = this._.output;
      buffer.push('<!doctype ', value, '>');
    },

    /**
     * Writes an XML declaration.
     *
     *     // Writes '<?xml version="1.0" ?>'.
     *     writer.xmlDecl('version="1.0"');
     *
     * @param {String} value The declaration text.
     */
    xmlDecl: function(value) {
      var buffer = this._.output;
      buffer.push('<?xml ', value, ' ?>');
    },

    /**
     * Writes any kind of data to the output.
     *
     *     writer.write('This is an <b>example</b>.');
     *
     * @param {String} data
     */
    write: function(data) {
      var buffer = this._.output;
      buffer.push(data);
    },

    /**
     * Empties the current output buffer.
     *
     *     writer.reset();
     */
    reset: function() {
      var internals = this._;
      internals.output = [];
      internals.indent = false;
    },

    /**
     * Returns everything written so far as a single string.
     *
     *     var html = writer.getHtml();
     *
     * @param {Boolean} reset Indicates that the {@link #reset} method is to
     * be automatically called after retrieving the HTML.
     * @returns {String} The HTML written to the writer so far.
     */
    getHtml: function(reset) {
      var result = this._.output.join('');
      if (reset) {
        this.reset();
      }
      return result;
    }
  }
});
/**
 * A lightweight representation of an HTML node. Nodes are linked to their
 * siblings through `previous` / `next` and to their container through
 * `parent` (whose `children` array holds them).
 *
 * @since 4.1
 * @class
 * @constructor Creates a node class instance.
 */
Parser.Node = function Node() {};

Parser.Node.prototype = {
  /**
   * Detaches this node from its parent and re-links its neighbours.
   *
   * @since 4.1
   */
  remove: function() {
    var siblings = this.parent.children;
    var position = tools.indexOf(siblings, this);
    var before = this.previous;
    var after = this.next;

    if (before) {
      before.next = after;
    }
    if (after) {
      after.previous = before;
    }
    siblings.splice(position, 1);
    this.parent = null;
  },

  /**
   * Puts the given node in this node's place.
   *
   * @since 4.1
   * @param {Parser.Node} node The node that will replace this one.
   */
  replaceWith: function(node) {
    var siblings = this.parent.children;
    var position = tools.indexOf(siblings, this);

    var before = this.previous;
    node.previous = before;
    var after = this.next;
    node.next = after;

    if (before) {
      before.next = node;
    }
    if (after) {
      after.previous = node;
    }
    siblings[position] = node;
    node.parent = this.parent;
    this.parent = null;
  },

  /**
   * Inserts this node right after the given one.
   *
   * @since 4.1
   * @param {Parser.Node} node The node that will precede this element.
   */
  insertAfter: function(node) {
    var siblings = node.parent.children;
    var position = tools.indexOf(siblings, node);
    var follower = node.next;

    siblings.splice(position + 1, 0, this);
    this.next = node.next;
    this.previous = node;
    node.next = this;
    if (follower) {
      follower.previous = this;
    }
    this.parent = node.parent;
  },

  /**
   * Inserts this node right before the given one.
   *
   * @since 4.1
   * @param {Parser.Node} node The node that will follow this element.
   */
  insertBefore: function(node) {
    var siblings = node.parent.children;
    var position = tools.indexOf(siblings, node);

    siblings.splice(position, 0, this);
    this.next = node;
    this.previous = node.previous;
    if (node.previous) {
      node.previous.next = this;
    }
    node.previous = this;
    this.parent = node.parent;
  }
};
/**
 * Node holding the raw content of a CDATA-like section.
 *
 * @class
 * @extends Parser.Node
 * @constructor Creates a cdata class instance.
 * @param {String} value The CDATA section value.
 */
Parser.CData = function CData(value) {
  /**
   * The raw CDATA content.
   *
   * @property {String}
   */
  this.value = value;
};

// Inherit the tree operations from Parser.Node, then add the CDATA members.
Parser.CData.prototype = new Parser.Node();
tools.extend(Parser.CData.prototype, {
  /**
   * CDATA shares its node type with text nodes: the constant NODE_TEXT.
   *
   * @readonly
   * @property {Number} [=NODE_TEXT]
   */
  type: NODE_TEXT,

  /**
   * Emits the stored value through `writer.write()`, without any escaping.
   *
   * @param {Parser.BasicWriter} writer The writer to which write the HTML.
   */
  writeHtml: function(writer) {
    writer.write(this.value);
  }
});
/**
 * Node representing an HTML comment.
 *
 * @class
 * @extends Parser.Node
 * @constructor Creates a comment class instance.
 * @param {String} value The comment text value.
 */
Parser.Comment = function Comment(value) {
  /**
   * The text between the comment delimiters.
   *
   * @property {String}
   */
  this.value = value;

  /** @private */
  var internals = {};
  internals.isBlockLike = false;
  this._ = internals;
};

// Inherit the tree operations from Parser.Node, then add the comment members.
Parser.Comment.prototype = new Parser.Node();
tools.extend(Parser.Comment.prototype, {
  /**
   * The node type: the constant NODE_COMMENT.
   *
   * @readonly
   * @property {Number} [=NODE_COMMENT]
   */
  type: NODE_COMMENT,

  /**
   * Emits this comment through `writer.comment()`.
   *
   * @param {Parser.BasicWriter} writer The writer to which write the HTML.
   */
  writeHtml: function(writer) {
    writer.comment(this.value);
  }
});
/**
 * Node representing a doctype declaration.
 *
 * @class
 * @extends Parser.Node
 * @constructor Creates a doctype class instance.
 * @param {String} value The doctype value.
 */
Parser.Doctype = function Doctype(value) {
  /**
   * The doctype content.
   *
   * @property {String}
   */
  this.value = value;

  /** @private */
  var internals = {};
  internals.isBlockLike = false;
  this._ = internals;
};

// Inherit the tree operations from Parser.Node, then add the doctype members.
Parser.Doctype.prototype = new Parser.Node();
tools.extend(Parser.Doctype.prototype, {
  /**
   * The node type: the constant NODE_DOCUMENT_TYPE.
   *
   * @readonly
   * @property {Number} [=NODE_DOCUMENT_TYPE]
   */
  type: NODE_DOCUMENT_TYPE,

  /**
   * Emits this doctype through `writer.doctype()`.
   *
   * @param {Parser.BasicWriter} writer The writer to which write the HTML.
   */
  writeHtml: function(writer) {
    writer.doctype(this.value);
  }
});
/**
 * Node representing an XML declaration.
 *
 * @class
 * @extends Parser.Node
 * @constructor Creates a xmlDecl class instance.
 * @param {String} value The xmlDecl value.
 */
Parser.XmlDecl = function XmlDecl(value) {
  /**
   * The declaration content.
   *
   * @property {String}
   */
  this.value = value;

  /** @private */
  var internals = {};
  internals.isBlockLike = false;
  this._ = internals;
};

// Inherit the tree operations from Parser.Node, then add the declaration members.
Parser.XmlDecl.prototype = new Parser.Node();
tools.extend(Parser.XmlDecl.prototype, {
  /**
   * The node type: the constant NODE_PROCESSING_INSTRUCTION.
   *
   * @readonly
   * @property {Number} [=NODE_PROCESSING_INSTRUCTION]
   */
  type: NODE_PROCESSING_INSTRUCTION,

  /**
   * Emits this declaration through `writer.xmlDecl()`.
   *
   * @param {Parser.BasicWriter} writer The writer to which write the HTML.
   */
  writeHtml: function(writer) {
    writer.xmlDecl(this.value);
  }
});
/**
 * A lightweight representation of an HTML element.
 *
 * @class
 * @extends Parser.Node
 * @constructor Creates an element class instance.
 * @param {String} name The element name.
 * @param {Object} attributes An object holding all attributes defined for
 * this element.
 */
Parser.Element = function Element(name, attributes) {
  // Element names come from parsed (untrusted) markup. The DTD sets are plain
  // objects, so a bare `set[name]` would also match inherited members such as
  // "constructor" or "toString" and misclassify those tags as known, empty
  // and block-like. Only own keys count as set membership.
  var hasOwn = Object.prototype.hasOwnProperty;
  function inSet(set, key) {
    return hasOwn.call(set, key);
  }

  /**
   * The element name.
   *
   * @property {String}
   */
  this.name = name;

  /**
   * Holds the attributes defined for this element.
   *
   * @property {Object}
   */
  this.attributes = attributes || {};

  /**
   * The nodes that are direct children of this element.
   */
  this.children = [];

  // Reveal the real semantic of our internal custom tag name (#6639),
  // when resolving whether it's block like.
  var realName = name || '';
  var prefixed = realName.match(/^cke:(.*)/);
  prefixed && (realName = prefixed[1]);

  var isBlockLike = inSet(dtd.$nonBodyContent, realName) ||
    inSet(dtd.$block, realName) ||
    inSet(dtd.$listItem, realName) ||
    inSet(dtd.$tableContent, realName) ||
    inSet(dtd.$nonEditable, realName) ||
    realName == 'br';

  // `true` for elements that never have content, like <br> or <img>.
  this.isEmpty = inSet(dtd.$empty, name);
  // `true` when the DTD has no entry for this element name.
  this.isUnknown = !inSet(dtd, name);

  /** @private */
  this._ = {
    isBlockLike: isBlockLike,
    hasInlineStarted: this.isEmpty || !isBlockLike
  };
};
/**
 * Root container for a parsed piece of HTML: a parent-less node list.
 *
 * @class
 * @constructor Creates a fragment class instance.
 */
Parser.Fragment = function Fragment() {
  /**
   * The top-level nodes of this fragment.
   *
   *     var fragment = Parser.Fragment.fromHtml('<b>Sample</b> Text');
   *     alert(fragment.children.length); // 2
   */
  this.children = [];

  /**
   * The fragment parent. Should always be null.
   *
   * @property {Object} [=null]
   */
  this.parent = null;

  /** @private */
  var internals = {};
  internals.isBlockLike = true;
  internals.hasInlineStarted = false;
  this._ = internals;
};
(function() {
// Block-level elements whose internal structure should be respected during
// parser fixing: table, ul, ol and dl themselves plus every element the DTD
// allows directly inside them.
var nonBreakingBlocks = tools.extend({table: 1, ul: 1, ol: 1, dl: 1}, dtd.table, dtd.ul, dtd.ol, dtd.dl);
// The two list containers that get special handling when nested directly
// inside one another.
var listBlocks = {ol: 1, ul: 1};
// Dtd of the fragment element, basically it accepts anything except for intermediate structure, e.g. an orphan <li>.
// Built as the union of what <html>, <body> and <head> accept, plus html/style/script.
var rootDtd = tools.extend({}, {html: 1}, dtd.html, dtd.body, dtd.head, {style: 1, script: 1});
// Tells whether a node is of a kind that should be dropped when it ends up
// with no content. The result is only meant to be tested for truthiness.
function isRemoveEmpty(node) {
  var tagName = node.name;
  // A link (an <a> carrying an href) is removable when empty, whereas a bare
  // anchor is kept. (#7894)
  if (tagName == 'a') {
    var href = node.attributes.href;
    if (href) {
      return href;
    }
  }
  return dtd.$removeEmpty[tagName];
}
/**
* Creates a {@link Parser.Fragment} from an HTML string.
*
* var fragment = Parser.Fragment.fromHtml('<b>Sample</b> Text');
* alert(fragment.children[0].name); // 'b'
* alert(fragment.children[1].value); // ' Text'
*
* @static
* @param {String} fragmentHtml The HTML to be parsed, filling the fragment.
* @param {Parser.Element/String} [parent] Optional contextual
* element. When given, the content is parsed as the content of this element and fixed
* to match it.
* If not provided, a new {@link Parser.Fragment} is used
* as the parent and returned.
* @param {String/Boolean} [fixingBlock] When `parent` is a block limit element
* and this param is a string (i.e. not `false`), block-less content is
* prevented from being a direct child of the parent by wrapping
* it in a block element of the specified tag, e.g.
* when `fixingBlock` is specified as `p`, the content `<body><i>foo</i></body>`
* will be fixed into `<body><p><i>foo</i></p></body>`.
* @param {Object} [opts] Options object handed to the {@link Parser} constructor.
* @returns {Parser.Fragment/Parser.Element} The created fragment or passed `parent`.
*/
Parser.Fragment.fromHtml = function(fragmentHtml, parent, fixingBlock, opts) {
var parser = new Parser(opts);
var root = parent instanceof Parser.Element ? parent : typeof parent == 'string' ? new Parser.Element(parent) : new Parser.Fragment();
var pendingInline = [];
var pendingBRs = [];
var currentNode = root;
// Indicate we're inside a <textarea> element, spaces should be touched differently.
var inTextarea = root.name == 'textarea';
// Indicate we're inside a <pre> element, spaces should be touched differently.
var inPre = root.name == 'pre';
// Tries to open the inline elements queued in `pendingInline` at the current
// position, so that whatever comes next is added inside them.
//
// @param {String} [newTagName] Name of the element about to be added. When
// given, a pending element is only opened if its DTD entry accepts that name
// (or if either of the two has no DTD entry at all).
function checkPending(newTagName) {
// Set once the queued <br> elements have been flushed during this call.
var pendingBRsSent;
if (pendingInline.length > 0) {
for (var i = 0; i < pendingInline.length; i++) {
var pendingElement = pendingInline[i];
var pendingName = pendingElement.name;
var pendingDtd = dtd[pendingName];
// Falsy when the current node has no name or no DTD entry.
var currentDtd = currentNode.name && dtd[currentNode.name];
// Open the pending element when the current node may contain it AND it may
// contain the upcoming element.
if ((!currentDtd || currentDtd[pendingName]) && (!newTagName || !pendingDtd || pendingDtd[newTagName] || !dtd[newTagName])) {
if (!pendingBRsSent) {
sendPendingBRs();
pendingBRsSent = 1;
}
// Get a clone for the pending element.
pendingElement = pendingElement.clone();
// Add it to the current node and make it the current,
// so the new element will be added inside of it.
pendingElement.parent = currentNode;
currentNode = pendingElement;
// Remove the pending element (back the index by one
// to properly process the next entry).
pendingInline.splice(i, 1);
i--;
} else {
// Some element of the same type cannot be nested, flat them,
// e.g. <a href="#">foo<a href="#">bar</a></a>. (#7894)
// The current node is closed (added to its parent, which becomes the
// current node) and the same pending entry is examined again.
if (pendingName == currentNode.name)
addElement(currentNode, currentNode.parent, 1), i--;
}
}
}
}
// Flushes the queued <br> elements, oldest first, into the current node.
function sendPendingBRs() {
  var lineBreak;
  while (pendingBRs.length) {
    lineBreak = pendingBRs.shift();
    addElement(lineBreak, currentNode);
  }
}
// Besides simply appending the specified element to the target, this function
// also takes care of other chores, like forcing a block in body, etc.
//
// @param {Element} element The element to be added as the last child of {@link target}.
// @param {Element} target The parent element to receive the new node. Defaults to
// the current node, then to the root.
// @param {Boolean} [moveCurrent=false] Don't change the "currentNode" global unless
// there's a return point node specified on the element, otherwise move current onto {@link target} node.
//
function addElement(element, target, moveCurrent) {
target = target || currentNode || root;
// Current element might be mangled by fix body below,
// save it for restore later.
var savedCurrent = currentNode;
// Ignore any element that has already been added.
// (`previous` being undefined is used as the "not added yet" marker.)
if (element.previous === undefined) {
if (checkAutoParagraphing(target, element)) {
// Create a <p> in the fragment.
// (More precisely: an element named after `fixingBlock`, opened through
// the parser's own onTagOpen handler.)
currentNode = target;
parser.onTagOpen(fixingBlock, {});
// The new target now is the <p>.
element.returnPoint = target = currentNode;
}
// Avoid adding empty inline.
if (!(isRemoveEmpty(element) && !element.children.length))
target.add(element);
// Leaving a <pre> or <textarea> ends the special whitespace handling.
if (element.name == 'pre')
inPre = false;
if (element.name == 'textarea')
inTextarea = false;
}
// Decide where parsing continues: at the recorded return point if there is
// one, otherwise at the target or at the node that was current on entry.
if (element.returnPoint) {
currentNode = element.returnPoint;
delete element.returnPoint;
} else {
currentNode = moveCurrent ? target : savedCurrent;
}
}
// Auto paragraphing should happen when inline content enters the root element.
//
// @param {Element/Fragment} parent The node about to receive `node`.
// @param {Node} node The node about to be added.
// @returns {Boolean/undefined} `true` when `node` has to be wrapped in a
// `fixingBlock` element first; a falsy value otherwise (`undefined` when
// `parent` is not a place where auto paragraphing applies).
function checkAutoParagraphing(parent, node) {
  // Only the root and <body> are candidates, and only when a fixing block is configured.
  if (!((parent == root || parent.name == 'body') && fixingBlock))
    return;

  // A named parent must be able to contain the fixing block. An element that
  // is missing from the DTD (e.g. a custom root tag) has no entry to consult;
  // treat it as not accepting the block instead of throwing on the lookup.
  if (parent.name) {
    var parentDtd = dtd[parent.name];
    if (!parentDtd || !parentDtd[fixingBlock])
      return;
  }

  var name, realName;
  if (node.attributes && (realName = node.attributes['data-cke-real-element-type'])) {
    name = realName;
  } else {
    name = node.name;
  }

  // Text node, inline elements are subjected, except for <script>/<style>.
  return name && name in dtd.$inline &&
    !(name in dtd.head) &&
    !node.isOrphan ||
    node.type == NODE_TEXT;
}
// Decides whether two tag names plausibly denote siblings inside the same
// structural element: `tag1` must be a list item or table content tag, and
// the names must be equal or form the <dt>/<dd> pair.
//
// @param {String} tag1 Name of the incoming tag.
// @param {String} tag2 Name of the tag it is compared with.
// @returns {Boolean}
function possiblySibling(tag1, tag2) {
  var isStructural = tag1 in dtd.$listItem || tag1 in dtd.$tableContent;
  if (!isStructural)
    return false;
  if (tag1 == tag2)
    return true;
  return (tag1 == 'dt' && tag2 == 'dd') || (tag1 == 'dd' && tag2 == 'dt');
}
// Handler for an opening tag. Wraps the tag in a Parser.Element, parks
// removable-when-empty elements and <br>s in their pending queues, and
// otherwise repositions "currentNode" until the DTD accepts the element as a
// child, applying the list/table fix-ups below along the way.
//
// @param {String} tagName Name of the tag being opened.
// @param {Object} attributes Attribute map passed to the Parser.Element constructor.
// @param {Boolean} [selfClosing] Truthy when the tag was self-closed in the source;
// only consulted for unknown elements.
// @param {Boolean} [optionalClose] Stored on the element as "isOptionalClose"; such an
// element is closed implicitly when a later tag does not fit inside it.
parser.onTagOpen = function(tagName, attributes, selfClosing, optionalClose) {
var element = new Parser.Element(tagName, attributes);
// "isEmpty" will be always "false" for unknown elements, so we
// must force it if the parser has identified it as a selfClosing tag.
if (element.isUnknown && selfClosing)
element.isEmpty = true;
// Check for optional closed elements, including browser quirks and manually opened blocks.
element.isOptionalClose = optionalClose;
// This is a tag to be removed if empty, so do not add it immediately.
if (isRemoveEmpty(element)) {
pendingInline.push(element);
return;
} else
// Remember that we are inside <pre>/<textarea>; addElement() clears these
// flags again once the element gets added.
if (tagName == 'pre') {
inPre = true;
} else
if (tagName == 'textarea') {
inTextarea = true;
}
// Line breaks are queued and flushed later by sendPendingBRs().
if (tagName == 'br') {
pendingBRs.push(element);
return;
}
// Each pass applies at most one fix-up and then re-checks against the
// (possibly changed) current node; the loop ends once the element fits or
// is given up on as an orphan.
while (1) {
var currentName = currentNode.name;
// DTD entry consulted for the current node: the root uses rootDtd, and named
// nodes missing from the DTD fall back to the <div> or <span> entry
// depending on their block-like flag.
var currentDtd = currentName ? (dtd[currentName] || (currentNode._.isBlockLike ? dtd.div : dtd.span)) : rootDtd;
// If the element cannot be child of the current element.
if (!element.isUnknown && !currentNode.isUnknown && !currentDtd[tagName]) {
// Current node doesn't have a close tag, time for a close
// as this element doesn't fit in. (#7497)
if (currentNode.isOptionalClose) {
parser.onTagClose(currentName);
} else
// Fixing malformed nested lists by moving it into a previous list item. (#3828)
if (tagName in listBlocks && currentName in listBlocks) {
var children = currentNode.children;
var lastChild = children[children.length - 1];
// Establish the list item if it does not exist yet.
if (!(lastChild && lastChild.name == 'li'))
addElement((lastChild = new Parser.Element('li')), currentNode);
// Come back to the outer list once the nested one has been added
// (an already recorded return point is kept).
!element.returnPoint && (element.returnPoint = currentNode);
currentNode = lastChild;
} else
// Establish new list root for orphan list items, but NOT to create
// new list for the following ones, fix them instead. (#6975)
// <dl><dt>foo<dd>bar</dl>
// <ul><li>foo<li>bar</ul>
if (tagName in dtd.$listItem && !possiblySibling(tagName, currentName)) {
parser.onTagOpen(tagName == 'li' ? 'ul' : 'dl', {}, 0, 1);
} else
// We're inside a structural block like table and list, AND the incoming element
// is not of the same type (e.g. <td>td1<td>td2</td>), we simply add this new one before it,
// and most importantly, return back to here once this element is added,
// e.g. <table><tr><td>td1</td><p>p1</p><td>td2</td></tr></table>
if (currentName in nonBreakingBlocks && !possiblySibling(tagName, currentName)) {
!element.returnPoint && (element.returnPoint = currentNode);
currentNode = currentNode.parent;
} else {
// The current element is an inline element, which
// need to be continued even after the close, so put
// it in the pending list.
if (currentName in dtd.$inline)
pendingInline.unshift(currentNode);
// The most common case where we just need to close the
// current one and append the new one to the parent.
if (currentNode.parent) {
addElement(currentNode, currentNode.parent, 1);
} else {
// We've tried our best to fix the embarrassment here, while
// this element still doesn't find its parent, mark it as
// orphan and show our tolerance to it.
element.isOrphan = 1;
break;
}
}
} else {
break;
}
}
// NOTE(review): checkPending() is defined above this excerpt; presumably it
// re-opens the queued inline elements around the new tag - confirm there.
checkPending(tagName);
sendPendingBRs();
element.parent = currentNode;
// Empty elements have no content to wait for, so they are added right away;
// anything else becomes the current node and is added when it is closed.
if (element.isEmpty) {
addElement(element);
} else {
currentNode = element;
}
};
// Handler for a closing tag. A queued (not yet materialised) inline element
// of that name is simply dropped from the queue. Otherwise the open nodes are
// searched, starting at "currentNode" and following return points/parents,
// for the element being closed; when it is found, every node passed on the
// way is added to its parent, the element itself is added, and the inline
// nodes that were cut short are queued so they can continue afterwards.
// A closing tag that matches nothing changes no tree state.
//
// @param {String} tagName Name of the tag being closed.
parser.onTagClose = function(tagName) {
  // Latest queued entry wins; dropping it is all there is to do.
  var slot = pendingInline.length;
  while (slot-- > 0) {
    if (pendingInline[slot].name == tagName) {
      pendingInline.splice(slot, 1);
      return;
    }
  }
  // Nodes between the current one and the match. They are only committed
  // once we know the closing tag really closes something.
  var closedOnTheWay = [];
  // Inline nodes among them, outermost first, to be continued later.
  var inlineToReopen = [];
  var match = currentNode;
  while (match != root && match.name != tagName) {
    if (!match._.isBlockLike)
      inlineToReopen.unshift(match);
    closedOnTheWay.push(match);
    // A return point takes precedence over the structural parent.
    match = match.returnPoint || match.parent;
  }
  if (match != root) {
    for (var n = 0; n < closedOnTheWay.length; n++) {
      var skipped = closedOnTheWay[n];
      addElement(skipped, skipped.parent);
    }
    currentNode = match;
    if (match._.isBlockLike)
      sendPendingBRs();
    addElement(match, match.parent);
    // Unless addElement() moved "currentNode" elsewhere, the parent is
    // the one receiving new nodes from now on.
    if (match == currentNode)
      currentNode = currentNode.parent;
    pendingInline = pendingInline.concat(inlineToReopen);
  }
  // Once </body> has been seen, auto paragraphing is switched off.
  if (tagName == 'body')
    fixingBlock = false;
};
// Handler for a run of character data. Non-whitespace text that lands
// directly inside a list/table structure which cannot hold text is first
// wrapped in the matching item element (#8540, #8870); otherwise pending
// <br>s and inline elements are flushed, the auto-paragraph block is opened
// when required, and a Parser.Text node is appended to the current node.
//
// The handler always goes through `parser` rather than `this`, so it also
// works when invoked without a receiver (the file runs in strict mode, where
// `this` would then be undefined).
//
// @param {String} text The raw text.
parser.onText = function(text) {
  var currentName = currentNode.name;
  // DTD entry for the current node; unnamed (root) uses rootDtd, names missing
  // from the DTD fall back to the <div>/<span> entry.
  var currentDtd = currentName ? (dtd[currentName] || (currentNode._.isBlockLike ? dtd.div : dtd.span)) : rootDtd;
  var isWhitespace = !/[^ \t\n\r]/.test(text);
  // Fix orphan text in list/table. (#8540) (#8870)
  if (!isWhitespace && !inTextarea && !currentDtd['#'] && currentName in nonBreakingBlocks) {
    var wrapperName = currentName in listBlocks ? 'li' :
      currentName == 'dl' ? 'dd' :
      currentName == 'table' ? 'tr' :
      currentName == 'tr' ? 'td' : '';
    // Pass an explicit (empty) attribute map, like the other internal
    // onTagOpen() calls do.
    parser.onTagOpen(wrapperName, {});
    // Retry now that the wrapper is the current node.
    parser.onText(text);
    return;
  }
  sendPendingBRs();
  checkPending();
  var textNode = new Parser.Text(text);
  if (checkAutoParagraphing(currentNode, textNode))
    parser.onTagOpen(fixingBlock, {}, 0, 1);
  currentNode.add(textNode);
};
// Handler for a CDATA section: wraps the content in a Parser.CData node and
// appends it to the current node (pending <br>s/inlines are not flushed here).
//
// @param {String} cdata The CDATA content.
parser.onCDATA = function(cdata) {
  var cdataNode = new Parser.CData(cdata);
  currentNode.add(cdataNode);
};
// Handler for a comment: flushes the pending <br> and inline queues, then
// appends a Parser.Comment node to the current node.
//
// @param {String} comment The comment text.
parser.onComment = function(comment) {
  sendPendingBRs();
  checkPending();
  var commentNode = new Parser.Comment(comment);
  currentNode.add(commentNode);
};
// Handler for a doctype declaration: flushes the pending queues, appends a
// Parser.Doctype node to the root and records the raw value as `root.doctype`.
//
// @param {String} value The doctype value.
parser.onDoctype = function(value) {
  sendPendingBRs();
  checkPending();
  var doctypeNode = new Parser.Doctype(value);
  root.add(doctypeNode);
  root.doctype = value;
};
// Handler for an XML declaration: flushes the pending queues, appends a
// Parser.XmlDecl node to the root and records the raw value as `root.xmlDecl`.
//
// @param {String} value The declaration value.
parser.onXmlDecl = function(value) {
  sendPendingBRs();
  checkPending();
  var declarationNode = new Parser.XmlDecl(value);
  root.add(declarationNode);
  root.xmlDecl = value;
};
// Parse it. The handlers assigned to `parser` above build the tree under
// `root` (presumably invoked by parse() as it walks the markup - confirm
// against the Parser implementation).
parser.parse(fragmentHtml);
// Close all pending nodes, make sure return point is properly restored:
// each pass adds the current node to its parent and, through
// moveCurrent = 1, steps "currentNode" up to that parent unless the node
// carries a return point, which addElement() honours instead.
while (currentNode != root)
addElement(currentNode, currentNode.parent, 1);
return root;
};
Parser.Fragment.prototype = {
  /**
   * The node type. This is a constant value set to {@link NODE_DOCUMENT_FRAGMENT}.
   *
   * @readonly
   * @property {Number} [=NODE_DOCUMENT_FRAGMENT]
   */
  type: NODE_DOCUMENT_FRAGMENT,
  /**
   * Adds a node to this fragment and links it to its new neighbours.
   * Parser.Element.prototype reuses this very function.
   *
   * The node's `previous` and `parent` are set, and when the node is
   * inserted in front of an existing child, that child's `previous` is
   * re-pointed at the new node so the back-links stay consistent.
   *
   * @param {Parser.Node} node The node to be added.
   * @param {Number} [index] Integer position at which to insert. Omitted or
   * non-numeric means "append"; a negative value counts from the end and a
   * value past the end appends, mirroring what Array#splice does with it.
   */
  add: function(node, index) {
    var children = this.children;
    var count = children.length;
    var at = isNaN(index) ? count : Number(index);
    // Normalise the position the same way splice() will, so that the
    // sibling links below describe where the node really ends up.
    if (at < 0)
      at = Math.max(count + at, 0);
    else if (at > count)
      at = count;
    node.previous = at > 0 ? children[at - 1] : null;
    node.parent = this;
    children.splice(at, 0, node);
    // Mid-list insertion: the displaced child now follows the new node.
    var follower = children[at + 1];
    if (follower)
      follower.previous = node;
    // Latches to true once a text node or a non block-like element is added.
    if (!this._.hasInlineStarted)
      this._.hasInlineStarted = node.type == NODE_TEXT || (node.type == NODE_ELEMENT && !node._.isBlockLike);
  },
  /**
   * Render the fragment and return the HTML.
   *
   *		var fragment = Parser.Fragment.fromHtml('<P><B>Example');
   *		alert(fragment.getHtml()); // '<p><b>Example</b></p>'
   *
   * The writer is created with its `xhtml` option set to the result of
   * matching the recorded doctype (if any) against /xhtml/i.
   *
   * @returns {String} Whatever `writer.getHtml(true)` yields after writing.
   */
  getHtml: function() {
    var doctype = this.doctype || '';
    var writer = new Parser.BasicWriter({xhtml: doctype.match(/xhtml/i)});
    this.writeHtml(writer);
    return writer.getHtml(true);
  },
  /**
   * Writes the fragment HTML to a {@link Parser.BasicWriter}.
   *
   *		var writer = new Parser.BasicWriter();
   *		var fragment = Parser.Fragment.fromHtml('<P><B>Example');
   *		fragment.writeHtml(writer);
   *		alert(writer.getHtml()); // '<p><b>Example</b></p>'
   *
   * @param {Parser.BasicWriter} writer The writer to which write the HTML.
   */
  writeHtml: function(writer) {
    this.writeChildrenHtml(writer);
  },
  /**
   * Writes the child nodes of this fragment, in order.
   *
   * @param {Parser.BasicWriter} writer The writer to which write the HTML.
   */
  writeChildrenHtml: function(writer) {
    var children = this.children;
    for (var i = 0, l = children.length; i < l; i++)
      children[i].writeHtml(writer);
  },
  /**
   * Execute callback on each node (of given type) in this document fragment.
   *
   *		var fragment = Parser.Fragment.fromHtml('<p>foo<b>bar</b>bom</p>');
   *		fragment.forEach(function(node) {
   *			console.log(node);
   *		});
   *		// Will log:
   *		// 1. document fragment,
   *		// 2. <p> element,
   *		// 3. "foo" text node,
   *		// 4. <b> element,
   *		// 5. "bar" text node,
   *		// 6. "bom" text node.
   *
   * @since 4.1
   * @param {Function} callback Function to be executed on every node.
   * @param {Parser.Node} callback.node Node passed as argument.
   * @param {Number} [type] If specified `callback` will be executed only on nodes of this type.
   * @param {Boolean} [skipRoot] Don't execute `callback` on this fragment.
   */
  forEach: function(callback, type, skipRoot) {
    if (!skipRoot && (!type || this.type == type))
      callback(this);
    var children = this.children;
    // The length is captured up front, as before, so nodes appended by the
    // callback during the walk are not visited at this level.
    for (var i = 0, l = children.length; i < l; i++) {
      var node = children[i];
      if (node.type == NODE_ELEMENT) {
        // Elements recurse and report themselves.
        node.forEach(callback, type);
      } else if (!type || node.type == type) {
        callback(node);
      }
    }
  }
};
})();
/** @class Parser.Element */
(function() {
  var fragmentProto = Parser.Fragment.prototype;

  // Comparator for attribute entries of the form [name, value]: orders the
  // entries by attribute name.
  function compareAttributePairs(first, second) {
    var firstName = first[0];
    var secondName = second[0];
    if (firstName < secondName)
      return -1;
    return firstName > secondName ? 1 : 0;
  }

  Parser.Element.prototype = tools.extend(new Parser.Node(), {
    /**
     * The node type. This is a constant value set to {@link NODE_ELEMENT}.
     *
     * @readonly
     * @property {Number} [=NODE_ELEMENT]
     */
    type: NODE_ELEMENT,
    /**
     * Adds a node to the element children list. Shared with
     * Parser.Fragment.prototype.
     *
     * @method
     * @param {Parser.Node} node The node to be added.
     * @param {Number} [index] From where the insertion happens.
     */
    add: fragmentProto.add,
    /**
     * Creates a new element with this element's name and attributes;
     * children are not carried over. The attributes object itself is
     * handed to the constructor.
     * NOTE(review): whether the constructor copies that object is not
     * visible here - the clone may share it with the original.
     *
     * @returns {Parser.Element} The element clone.
     */
    clone: function() {
      var ElementCtor = Parser.Element;
      return new ElementCtor(this.name, this.attributes);
    },
    /**
     * Writes the element HTML to a writer: the opening tag, every
     * attribute (sorted by name when the writer has `sortAttributes` set),
     * the children and, unless the element is empty, the closing tag.
     *
     * @param {Parser.BasicWriter} writer The writer to which write the HTML.
     */
    writeHtml: function(writer) {
      var tagName = this.name;
      var attrs = this.attributes;
      var pairs = [];
      writer.openTag(tagName, attrs);
      // Gather the attributes as [name, value] entries.
      for (var key in attrs)
        pairs.push([key, attrs[key]]);
      if (writer.sortAttributes)
        pairs.sort(compareAttributePairs);
      for (var n = 0, total = pairs.length; n < total; n++) {
        var pair = pairs[n];
        writer.attribute(pair[0], pair[1]);
      }
      // End of the opening tag.
      writer.openTagClose(tagName, this.isEmpty);
      this.writeChildrenHtml(writer);
      if (!this.isEmpty)
        writer.closeTag(tagName);
    },
    /**
     * Send children of this element to the writer. Shared with
     * Parser.Fragment.prototype.
     *
     * @param {Parser.BasicWriter} writer The writer to which write the HTML.
     */
    writeChildrenHtml: fragmentProto.writeChildrenHtml,
    /**
     * Replace this element with its children: each child, last to first,
     * is inserted right after this element, which is then removed.
     *
     * @since 4.1
     */
    replaceWithChildren: function() {
      var kids = this.children;
      var remaining = kids.length;
      while (remaining) {
        remaining -= 1;
        kids[remaining].insertAfter(this);
      }
      this.remove();
    },
    /**
     * Execute callback on each node (of given type) in this element.
     * Shared with Parser.Fragment.prototype, so the element itself is
     * reported first unless `skipRoot` is set.
     *
     *		// Assuming elP is a <p> element holding foo<b>bar</b>bom:
     *		elP.forEach(function(node) {
     *			console.log(node);
     *		});
     *		// Will log:
     *		// 1. <p> element,
     *		// 2. "foo" text node,
     *		// 3. <b> element,
     *		// 4. "bar" text node,
     *		// 5. "bom" text node.
     *
     * @since 4.1
     * @param {Function} callback Function to be executed on every node.
     * @param {Parser.Node} callback.node Node passed as argument.
     * @param {Number} [type] If specified `callback` will be executed only on nodes of this type.
     * @param {Boolean} [skipRoot] Don't execute `callback` on this element.
     */
    forEach: fragmentProto.forEach
  });
})();
/**
 * A lightweight representation of an HTML text node.
 *
 * @class
 * @extends Parser.Node
 * @constructor Creates a text class instance.
 * @param {String} value The text node value.
 */
Parser.Text = function Text(value) {
  /**
   * The text held by this node.
   *
   * @property {String}
   */
  this.value = value;
  /** @private Internal flags; a text node is never block-like. */
  var internals = {};
  internals.isBlockLike = false;
  this._ = internals;
};
Parser.Text.prototype = tools.extend(new Parser.Node(), {
  /**
   * The node type. This is a constant value set to {@link NODE_TEXT}.
   *
   * @readonly
   * @property {Number} [=NODE_TEXT]
   */
  type: NODE_TEXT,
  /**
   * Hands this node's text value to the writer's `text()` method.
   *
   * @param {Parser.BasicWriter} writer The writer to which write the HTML.
   */
  writeHtml: function(writer) {
    var content = this.value;
    writer.text(content);
  }
});
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment