public
Last active

jQuery htmlDoc "fixer" - get HTML, HEAD, BODY in your $(html) - NEEDS TESTING

  • Download Gist
jquery.ba-htmldoc.js
JavaScript
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
/*!
* jQuery htmlDoc "fixer" - v0.2pre - 8/8/2011
* http://benalman.com/projects/jquery-misc-plugins/
*
* Copyright (c) 2010 "Cowboy" Ben Alman
* Dual licensed under the MIT and GPL licenses.
* http://benalman.com/about/license/
*/
 
(function($) {
  // RegExp that matches the opening and closing tags that get stripped when
  // markup is parsed through .innerHTML.
  // $1 = slash, $2 = tag name, $3 = attributes
  var matchTag = /<(\/?)(html|head|body|title|base|meta)(\s+[^>]*)?>/ig;
  // The matched tag names that are void elements: they never have content or
  // a closing tag, so their placeholder must be closed immediately.
  var voidTag = /^(?:base|meta)$/i;
  // Unique id prefix for selecting placeholder elements.
  var prefix = 'hd' + +new Date;

  /**
   * Parse an HTML document string into a jQuery collection that still
   * contains its HTML, HEAD, BODY, TITLE, BASE and META elements.
   *
   * Each of those tags is swapped for a placeholder DIV before the string is
   * handed to .html(); afterwards every placeholder is replaced by a real
   * element of the intended name carrying the original attributes.
   *
   * @param {string} html - Markup to parse.
   * @returns {jQuery} The top-level elements of the parsed markup (text nodes
   *   excluded). If the markup contains none of the tags above, this is
   *   simply $(html).
   */
  $.htmlDoc = function(html) {
    // The "intended" elements, in the order their opening tags appear. The
    // index of an element in this collection is also its placeholder number.
    var elems = $();

    // Input HTML string with every matched tag replaced by a placeholder DIV.
    var htmlParsed = html.replace(matchTag, function(tag, slash, name, attrs) {
      var isVoid = voidTag.test(name);
      // Attribute name/value pairs copied from the original tag.
      var obj = {};

      if ( slash ) {
        // A void element's placeholder was already closed when it was opened,
        // so a stray "</meta>" must not emit another "</div>" (that would
        // close an enclosing placeholder too early).
        return isVoid ? '' : '</div>';
      }

      // Create the intended element from a bare tag. Note that if a string of
      // attributes is added at this point, it fails.
      elems = elems.add('<' + name + '/>');

      // Let the browser parse the attribute string on a throwaway DIV, then
      // copy the result. An explicit "></div>" is used instead of "/>" so the
      // result does not depend on jQuery expanding self-closing tags and so
      // an unquoted value adjacent to the end of the tag keeps its exact text.
      if ( attrs ) {
        $.each($('<div' + attrs + '></div>')[0].attributes, function(i, attr) {
          obj[attr.name] = attr.value;
        });
      }
      elems.eq(-1).attr(obj);

      // A placeholder DIV with a unique id stands in for the intended
      // element. Void elements get an empty, already-closed placeholder so
      // that following siblings are not swallowed as their children.
      return '<div id="' + prefix + (elems.length - 1) + '">'
        + (isVoid ? '</div>' : '');
    });

    // If no placeholder elements were necessary, just return normal
    // jQuery-parsed HTML.
    if ( !elems.length ) {
      return $(html);
    }

    // Build the temporary tree under a fresh container on every call, so that
    // parsing a second document can never clean up or empty nodes that were
    // returned by an earlier call.
    var parent = $('<div/>').html(htmlParsed);

    // Replace each placeholder element with its intended element, moving the
    // placeholder's contents across.
    $.each(elems, function(i) {
      var placeholder = parent.find('#' + prefix + i).before(elems[i]);
      elems.eq(i).html(placeholder.contents());
      placeholder.remove();
    });

    // Return the topmost intended element(s), sans text nodes, removed from
    // the temporary container.
    return parent.children().detach();
  };

}(jQuery));
readme.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
From the jQuery API docs for .load():
 
jQuery uses the browser's .innerHTML property to parse the retrieved
document and insert it into the current document. During this process,
browsers often filter elements from the document such as <html>,
<title>, or <head> elements. As a result, the elements retrieved by
.load() may not be exactly the same as if the document were retrieved
directly by the browser.
 
Using jQuery, and given this test.html:
 
<!DOCTYPE HTML>
<html lang="en-US">
<head>
<title>Test page</title>
</head>
<body>
<div id="content">
<p>stuff</p>
<p>more stuff</p>
</div>
</body>
</html>
 
This behavior can be seen:
 
$.get( 'test.html', function( html ) {
// Not great: [, <title>Test page</title>, , <div id="content">…</div>, ]
console.log( $(html) );
// This fails: []
console.log( $(html).find( '#content') );
// This selects the content div, but.. ugly.
console.log( $(html).filter( '#content') );
// This also selects the content div, but.. also ugly.
console.log( $('<div/>').html( html ).find( '#content' ) );
});
 
This, on the other hand, works as you'd expect, and attributes should
be properly preserved:
 
$.get( 'test.html', function( html ) {
var hd = $.htmlDoc( html );
console.log( hd.filter( 'html' ).length ); // 1
console.log( hd.filter( 'html' ).attr( 'lang' ) ); // "en-US"
console.log( hd.find( 'head' ).length ); // 1
console.log( hd.find( 'body' ).length ); // 1
});

I ran some tests.

With $(htmlstr), it looks like every browser strips out BODY and HEAD. All but Opera strip out HTML. Many strip out TITLE, BASE, and META. Some strip out KEYGEN, PROGRESS, and SOURCE.

it's a great solution! thank you.

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.