Created
September 22, 2010 10:12
-
-
Save Mistat/591455 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Customized by Misato Takahashi <misato@takahashi.name>
 * - fix: a parse error occurred when the HTML document contains "<!DOCTYPE>"
 * - fix: a parse error occurred when an attribute name includes "-"
 * - fix: a parse error occurred when the letter case of a start tag and its end tag differ
 * - add function "getElementById"
 *
 * HTML Parser By John Resig (ejohn.org)
 * Original code by Erik Arvidsson, Mozilla Public License
 * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
 *
 * // Use like so:
 * HTMLParser(htmlString, {
 *     start: function(tag, attrs, unary) {},
 *     end: function(tag) {},
 *     chars: function(text) {},
 *     comment: function(text) {}
 * });
 *
 * // or to get an XML string:
 * HTMLtoXML(htmlString);
 *
 * // or to get an XML DOM Document
 * var dom = HTMLtoDOM(htmlString);
 *
 * dom.getElementById('id');
 * dom.getElementsByTagName('name');
 *
 * // or to inject into an existing document/DOM node
 * HTMLtoDOM(htmlString, document);
 * HTMLtoDOM(htmlString, document.body);
 *
 */
(function(){
  // Object that receives the public API (HTMLParser, HTMLtoXML, HTMLtoDOM).
  // In a classic script `this` is the global object; under strict mode or in
  // an ES module it is undefined, so fall back to globalThis when it exists.
  var root = this || ( typeof globalThis !== "undefined" ? globalThis : undefined );

  // Regular Expressions for parsing tags and attributes.
  //   startTag captures: 1 = tag name, 2 = raw attribute text, 3 = "/" when self-closed
  //   endTag   captures: 1 = tag name
  //   attr     captures: 1 = name, 2 = double-quoted value, 3 = single-quoted value,
  //                      4 = unquoted value
  // The attribute name pattern accepts "-" so that it agrees with startTag
  // (otherwise data-id="7" is split into "data" and id="7").
  var startTag = /^<(\w+)((?:\s+[\w\-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/,
    endTag = /^<\/(\w+)[^>]*>/,
    attr = /([\w\-]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

  // Empty Elements - HTML 4.01
  var empty = makeMap("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed");

  // Block Elements - HTML 4.01
  var block = makeMap("address,applet,blockquote,button,center,dd,del,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul");

  // Inline Elements - HTML 4.01
  var inline = makeMap("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var");

  // Elements that you can, intentionally, leave open
  // (and which close themselves)
  var closeSelf = makeMap("colgroup,dd,dt,li,option,p,td,tfoot,th,thead,tr");

  // Attributes that have their values filled in disabled="disabled"
  var fillAttrs = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected");

  // Special Elements (can contain anything)
  var special = makeMap("script,style");

  // Parse `html` and report what is found through the optional callbacks on
  // `handler`:
  //   start( tagName, attrs, unary ) - attrs is an array of { name, value, escaped }
  //   end( tagName )
  //   chars( text )
  //   comment( text )                - also receives a doctype as "DOCTYPE ..."
  // Tag names are reported in lower case. Throws the string
  // "Parse Error: <remaining input>" when an iteration cannot consume any
  // input (for example a "<" that does not start a recognizable tag).
  var HTMLParser = root.HTMLParser = function( html, handler ) {
    var index, chars, match, text, closing, stack = [], last = html;

    // Name of the innermost open element, or undefined when none is open.
    stack.last = function(){
      return this[ this.length - 1 ];
    };

    while ( html ) {
      chars = true;

      // Make sure we're not in a script or style element
      if ( !stack.last() || !hasOwn( special, stack.last() ) ) {

        // Doctype (handed to the comment callback without the leading "<!")
        if ( /^<!doctype/i.test( html ) ) {
          index = html.indexOf( ">" );

          if ( index >= 0 ) {
            if ( handler.comment ) {
              handler.comment( html.substring( 2, index ) );
            }
            html = html.substring( index + 1 );
            chars = false;
          }

        // Comment
        } else if ( startsWith( html, "<!--" ) ) {
          index = html.indexOf( "-->" );

          if ( index >= 0 ) {
            if ( handler.comment ) {
              handler.comment( html.substring( 4, index ) );
            }
            html = html.substring( index + 3 );
            chars = false;
          }

        // end tag
        } else if ( startsWith( html, "</" ) ) {
          match = html.match( endTag );

          if ( match ) {
            html = html.substring( match[0].length );
            parseEndTag( match[0], match[1] );
            chars = false;
          }

        // start tag
        } else if ( startsWith( html, "<" ) ) {
          match = html.match( startTag );

          if ( match ) {
            html = html.substring( match[0].length );
            parseStartTag( match[0], match[1], match[2], match[3] );
            chars = false;
          }
        }

        // Nothing above consumed input: everything up to the next "<" is text.
        if ( chars ) {
          index = html.indexOf( "<" );

          text = index < 0 ? html : html.substring( 0, index );
          html = index < 0 ? "" : html.substring( index );

          if ( handler.chars ) {
            handler.chars( text );
          }
        }

      } else {
        // Inside <script>/<style>: take everything up to the FIRST matching
        // close tag. [\s\S]*? (rather than a greedy .*) lets the content span
        // several lines and stops it from swallowing later elements.
        closing = new RegExp( "([\\s\\S]*?)<\\/" + stack.last() + "[^>]*>", "i" );

        html = html.replace( closing, function( all, body ) {
          // Unwrap comment / CDATA guards around the raw content.
          body = body.replace( /<!--([\s\S]*?)-->/g, "$1" )
            .replace( /<!\[CDATA\[([\s\S]*?)]]>/g, "$1" );

          if ( handler.chars ) {
            handler.chars( body );
          }

          return "";
        });

        parseEndTag( "", stack.last() );
      }

      // No progress was made in this iteration: bail out instead of looping forever.
      if ( html === last ) {
        throw "Parse Error: " + html;
      }
      last = html;
    }

    // Clean up any remaining tags
    parseEndTag();

    // Handle one start tag: apply the implicit-close rules, push the element
    // on the stack unless it is unary, and report it with its attributes.
    function parseStartTag( tag, tagName, rest, unary ) {
      tagName = tagName.toLowerCase();

      // A block element implicitly closes any inline elements still open.
      if ( hasOwn( block, tagName ) ) {
        while ( stack.last() && hasOwn( inline, stack.last() ) ) {
          parseEndTag( "", stack.last() );
        }
      }

      // e.g. a new <li> closes the previous, still open <li>.
      if ( hasOwn( closeSelf, tagName ) && stack.last() === tagName ) {
        parseEndTag( "", tagName );
      }

      unary = hasOwn( empty, tagName ) || !!unary;

      if ( !unary ) {
        stack.push( tagName );
      }

      if ( handler.start ) {
        var attrs = [];

        rest.replace( attr, function( match, name, doubleQuoted, singleQuoted, unquoted ) {
          // A value-less boolean attribute gets its own name as value.
          var value = doubleQuoted || singleQuoted || unquoted ||
            ( hasOwn( fillAttrs, name ) ? name : "" );

          attrs.push({
            name: name,
            value: value,
            escaped: value.replace(/(^|[^\\])"/g, '$1\\\"') //"
          });
        });

        handler.start( tagName, attrs, unary );
      }
    }

    // Close `tagName` together with every element opened after it. Without a
    // tag name, close everything that is still open. An end tag that matches
    // no open element is ignored.
    function parseEndTag( tag, tagName ) {
      var pos, i;

      // If no tag name is provided, clean shop
      if ( !tagName ) {
        pos = 0;

      // Find the closest opened tag of the same type
      } else {
        tagName = tagName.toLowerCase();
        for ( pos = stack.length - 1; pos >= 0; pos-- ) {
          if ( stack[ pos ] === tagName ) {
            break;
          }
        }
      }

      if ( pos >= 0 ) {
        // Close all the open elements, up the stack
        for ( i = stack.length - 1; i >= pos; i-- ) {
          if ( handler.end ) {
            handler.end( stack[ i ] );
          }
        }

        // Remove the open elements from the stack
        stack.length = pos;
      }
    }
  };

  // Serialize `html` as a tag-balanced string: lower-case tag names, every
  // attribute quoted, empty elements written as <tag/>. Text is copied
  // through unchanged. Returns the resulting string.
  root.HTMLtoXML = function( html ) {
    var results = "";

    HTMLParser(html, {
      start: function( tag, attrs, unary ) {
        results += "<" + tag;

        for ( var i = 0; i < attrs.length; i++ ) {
          results += " " + attrs[i].name + '="' + attrs[i].escaped + '"';
        }

        results += (unary ? "/" : "") + ">";
      },
      end: function( tag ) {
        results += "</" + tag + ">";
      },
      chars: function( text ) {
        results += text;
      },
      comment: function( text ) {
        results += "<!--" + text + "-->";
      }
    });

    return results;
  };

  // Build DOM nodes from `html`.
  //   doc - optional document or DOM node. When omitted a new document is
  //         created. When a node is given, the parsed nodes are appended to
  //         that node and created through its owner document.
  // Returns the (owner) document. Its getElementById is REPLACED by a lookup
  // over the ids seen during this parse, which returns undefined for unknown ids.
  root.HTMLtoDOM = function( html, doc ) {
    var target;

    if ( !doc ) {
      if ( typeof DOMDocument !== "undefined" ) {
        doc = new DOMDocument();
      } else if ( typeof document !== "undefined" && document.implementation && document.implementation.createDocument ) {
        doc = document.implementation.createDocument("", "", null);
      } else if ( typeof ActiveXObject !== "undefined" ) {
        doc = new ActiveXObject("Msxml.DOMDocument");
      }

      if ( !doc ) {
        throw new TypeError( "HTMLtoDOM: no DOM implementation available to create a document" );
      }

      target = doc;
    } else {
      // Remember where to inject before switching to the owner document,
      // which is needed to create nodes.
      target = doc;
      doc = doc.ownerDocument ||
        doc.getOwnerDocument && doc.getOwnerDocument() ||
        doc;
    }

    var ids = {};
    // Stack of open parent nodes; the injection target is its bottom entry.
    var elems = [ target ];
    var curParentNode = target;

    doc.getElementById = function (id) {
      return hasOwn( ids, id ) ? ids[id] : undefined;
    };

    HTMLParser( html, {
      start: function( tagName, attrs, unary ) {
        var elem = doc.createElement( tagName );
        var id;

        // Indexed loop: for...in would also visit enumerable properties
        // that other scripts may have added to Array.prototype.
        for ( var i = 0; i < attrs.length; i++ ) {
          elem.setAttribute( attrs[ i ].name, attrs[ i ].value );
          if ( attrs[ i ].name === "id" ) {
            id = attrs[ i ].value;
          }
        }

        if ( id ) {
          ids[id] = elem;
        }

        if ( curParentNode && curParentNode.appendChild ) {
          curParentNode.appendChild( elem );
        }

        if ( !unary ) {
          elems.push( elem );
          curParentNode = elem;
        }
      },
      end: function( tag ) {
        elems.length -= 1;

        // Init the new parentNode
        curParentNode = elems[ elems.length - 1 ];
      },
      chars: function( text ) {
        // Text directly under the document is dropped when it contains whitespace.
        // NOTE(review): this tests for ANY whitespace, not whitespace-only
        // text - confirm the intent before tightening it.
        if ( curParentNode === doc && /[\s\t]+/.test( text ) ) {
          return;
        }
        if ( curParentNode ) {
          curParentNode.appendChild( doc.createTextNode( text ) );
        }
      },
      comment: function( text ) {
        // create comment node
      }
    });

    return doc;
  };

  // Turn a comma separated list of names into a lookup object { name: true }.
  function makeMap(str){
    var obj = {}, items = str.split(",");
    for ( var i = 0; i < items.length; i++ ) {
      obj[ items[i] ] = true;
    }
    return obj;
  }

  // True when `key` is an own property of `map`; names such as "constructor"
  // or "__proto__" must not be resolved through Object.prototype.
  function hasOwn( map, key ) {
    return Object.prototype.hasOwnProperty.call( map, key );
  }

  // True when `str` begins with `prefix`; only position 0 is inspected.
  function startsWith( str, prefix ) {
    return str.lastIndexOf( prefix, 0 ) === 0;
  }
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment