Skip to content

Instantly share code, notes, and snippets.

@dvdbng
Last active November 27, 2018 13:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dvdbng/e2ec950ff63983932b7b19e0c8f273ac to your computer and use it in GitHub Desktop.
Save dvdbng/e2ec950ff63983932b7b19e0c8f273ac to your computer and use it in GitHub Desktop.
Download page DOM with CSS and images
/*
Note: Due to CORS, this only works in chromium with disabled web security.
Start chrome like this: chromium-browser --disable-web-security --user-data-dir=/tmp/chrometmp
Load the page you want and paste the script in the web console.
This will save a snapshot of the DOM and inline all CSS and images so that the page will usually
look exactly the same, but there is no JavaScript in the page.
Note that the generated file can be large in size, because the same URL might be inlined more than once.
*/
// Matches `@import "file.css"` / `@import 'file.css'`; group 1 is the URL without quotes.
// (`@import url(...)` is picked up by CSS_URL instead.)
var CSS_IMPORT = /@import\s*["']([^"']+)["']/g;
// Matches `url(...)`; group 1 is the URL, still wrapped in its quotes if it had any.
// Optional whitespace just inside the parentheses is skipped so that `url( "a.png" )`
// captures `"a.png"`, and unquoted URLs may be a single character long (`url(x)`).
var CSS_URL = /\burl\(\s*("[^"]+"|'[^']+'|[^"')\s][^)]*?)\s*\)/g;
// CSS constructs that can execute script; they are replaced with the text "blocked".
var BAD_CSS = /(-moz-binding|expression\s*\(|javascript\s*:)/gi;
// Alias for the URL constructor: uses window.URL, or the WebKit-prefixed
// window.webkitURL when the unprefixed name is missing.
var URL = (window.URL || window.webkitURL)
/**
 * Makes sure the JSZip library is available as `window.JSZip`.
 *
 * If it is not present yet, a <script> tag pointing at the unpkg CDN is appended
 * to <head>. The returned promise settles from the script's load/error events
 * instead of polling, so a blocked or failed download is reported as a rejection
 * rather than waiting forever.
 *
 * @returns {Promise<void>} Resolves once `window.JSZip` exists; rejects if the
 *   script fails to load or loads without defining `window.JSZip`.
 */
function loadJszip() {
  return new Promise(function(resolve, reject) {
    // Clear any AMD `define` on the page before the library script runs.
    // NOTE(review): presumably so JSZip attaches to `window` instead of
    // registering as an AMD module - confirm.
    window.define = null;
    if (window.JSZip) return resolve();
    var script = document.createElement('script');
    script.src = 'https://unpkg.com/jszip@3.1.5/dist/jszip.min.js';
    script.onload = function() {
      if (window.JSZip) {
        resolve();
      } else {
        reject(new Error('JSZip script loaded but window.JSZip is not defined'));
      }
    };
    script.onerror = function() {
      reject(new Error('Failed to load JSZip from ' + script.src));
    };
    document.head.appendChild(script);
  });
}
// Lookup table of "<tag>_<attribute>" pairs (both lowercase) whose value is a URL.
// isUrlAttribute() consults it; visitNode() passes matching attribute values to inlineUrl().
var URL_ATTRIBUTES = [
  'img_src',
  'link_href',
  'input_src',
  'body_background',
  'table_background',
  'td_background',
  'tr_background',
  'th_background',
  'tbody_background',
  'thead_background',
  'tfoot_background',
  'col_background',
  'colgroup_background',
  'section_background',
  'head_profile',
  'html_manifest',
  'command_icon',
  'embed_src',
  'object_data',
  'video_poster',
].reduce(function(table, key) {
  table[key] = true;
  return table;
}, {});
// Lookup table of "<tag>_<attribute>" pairs (both lowercase) whose value is blanked
// by visitNode() instead of being inlined. Declared with `var` like the other
// tables: the original bare assignment created an implicit global, which throws a
// ReferenceError in strict mode / ES modules.
var BLOCKED_ATTRIBUTES = {
  iframe_src: true,
  script_src: true,
  img_srcset: true,
};
/**
 * Tells whether an attribute on a given tag is listed in URL_ATTRIBUTES.
 *
 * @param {string} tagName - Element tag name, in any letter case.
 * @param {string} attribute - Attribute name, in any letter case.
 * @returns {boolean} The table entry when present, otherwise false.
 */
function isUrlAttribute(tagName, attribute) {
  const lookupKey = [tagName, attribute]
    .map(function(part) { return part.toLowerCase(); })
    .join('_');
  const entry = URL_ATTRIBUTES[lookupKey];
  return entry ? entry : false;
}
/**
 * Tells whether an attribute on a given tag is listed in BLOCKED_ATTRIBUTES.
 *
 * @param {string} tagName - Element tag name, in any letter case.
 * @param {string} attribute - Attribute name, in any letter case.
 * @returns {boolean} The table entry when present, otherwise false.
 */
function isBlockedAttribute(tagName, attribute) {
  const lookupKey = [tagName, attribute]
    .map(function(part) { return part.toLowerCase(); })
    .join('_');
  const entry = BLOCKED_ATTRIBUTES[lookupKey];
  return entry ? entry : false;
}
function downloadPage() {
var resources = {}; // content -> file
var resource_index = 0;
/**
 * Allocates the next numbered file name for a downloaded resource.
 *
 * The extension is chosen from the MIME type. Parameters such as
 * `; charset=utf-8` are ignored and matching is case-insensitive. Unknown or
 * missing types get `.bin`. Every call increments the shared `resource_index`
 * counter, so names are unique within one snapshot.
 *
 * @param {?string} mime - Value of the response's Content-Type header (may be null).
 * @returns {string} A name such as "3.css" or "7.png".
 */
function getFileName(mime) {
  // Explicit table instead of the deprecated RegExp.$1 static property.
  var EXTENSIONS = {
    'text/css': 'css',
    'text/html': 'html',
    'text/plain': 'txt',
    'image/gif': 'gif',
    'image/jpeg': 'jpeg',
    'image/jpg': 'jpg',
    'image/png': 'png',
    'image/svg+xml': 'svg',
    'image/webp': 'webp',
    'image/avif': 'avif',
    'image/bmp': 'bmp',
    'image/x-icon': 'ico',
    'image/vnd.microsoft.icon': 'ico',
    'font/woff': 'woff',
    'font/woff2': 'woff2',
    'font/ttf': 'ttf',
    'font/otf': 'otf',
    'application/font-woff': 'woff',
  };
  var type = String(mime == null ? '' : mime).split(';')[0].trim().toLowerCase();
  var extension = Object.prototype.hasOwnProperty.call(EXTENSIONS, type)
    ? EXTENSIONS[type]
    : 'bin';
  resource_index += 1;
  return resource_index + '.' + extension;
}
/**
 * Rewrites a piece of CSS so that every `@import "..."` and `url(...)` points at
 * the local file name returned by inlineUrl(), then neutralises script-capable
 * constructs (BAD_CSS) by replacing them with the text "blocked".
 *
 * A reference that cannot be inlined (invalid URL, network error, ...) is logged
 * with console.warn and left exactly as written, instead of aborting the whole
 * stylesheet or being rewritten to `url("undefined")`.
 *
 * @param {string} css_source - CSS text (stylesheet body or inline style value).
 * @param {string} base_uri - URL that relative references are resolved against.
 * @returns {Promise<string>} The rewritten CSS; '' when css_source is empty.
 */
async function processCss(css_source, base_uri) {
  if (!css_source) return '';

  // Decode CSS hex escapes (`\41 ` -> "A"). An escape is at most six hex digits and
  // may be followed by one space. fromCodePoint is needed for code points above
  // U+FFFF; zero, surrogates and out-of-range values become U+FFFD.
  function unscape(str) {
    return str.replace(/\\([a-f0-9]{1,6}) ?/gi, function(match, charcode) {
      var code = parseInt(charcode, 16);
      if (code === 0 || code > 0x10FFFF || (code >= 0xD800 && code <= 0xDFFF)) {
        return '\uFFFD';
      }
      return String.fromCodePoint(code);
    });
  }

  function unquote(str) {
    return str.replace(/^["']+/g, '').replace(/["']+$/g, '');
  }

  // Never rejects: a failed download yields null so the reference is left untouched.
  async function tryInline(url) {
    try {
      return await inlineUrl(url, base_uri);
    } catch (err) {
      console.warn('Could not inline "' + url + '" (base ' + base_uri + '):', err);
      return null;
    }
  }

  // Pass 1: collect every distinct referenced URL. replace() is used purely as a
  // "for each match" loop here; its result is discarded.
  const wanted = new Set();
  css_source.replace(CSS_IMPORT, function(match, url) {
    wanted.add(unscape(url));
    return match;
  });
  css_source.replace(CSS_URL, function(match, url) {
    wanted.add(unscape(unquote(url)));
    return match;
  });

  // Pass 2: download all of them in parallel.
  const urls = Array.from(wanted);
  const files = await Promise.all(urls.map(tryInline));
  const contents = new Map();
  urls.forEach(function(url, i) {
    contents.set(url, files[i]);
  });

  // Pass 3: rewrite the references, keeping the original text when no file is available.
  css_source = css_source.replace(CSS_IMPORT, function(match, url) {
    const file = contents.get(unscape(url));
    return file == null ? match : `@import "${file}"`;
  });
  css_source = css_source.replace(CSS_URL, function(match, url) {
    const file = contents.get(unscape(unquote(url)));
    return file == null ? match : `url("${file}")`;
  });
  return css_source.replace(BAD_CSS, 'blocked');
}
/**
 * Reads a Blob and returns its contents encoded as a `data:` URL.
 *
 * @param {Blob} blob - The blob to encode.
 * @returns {Promise<string>} Resolves with the data URL; rejects with the
 *   reader's error if the read fails (previously the promise never settled).
 */
function blobToDataURL(blob) {
  return new Promise(function(resolve, reject) {
    var reader = new FileReader();
    reader.onload = function() {
      resolve(reader.result);
    };
    reader.onerror = function() {
      reject(reader.error || new Error('Failed to read blob'));
    };
    reader.readAsDataURL(blob);
  });
}
/**
 * Downloads a resource, stores its content in `resources` under a freshly
 * allocated file name, and returns that file name.
 *
 * `data:` URLs are returned unchanged. Responses served as text/css are run
 * through processCss() first, so their own references are rewritten as well.
 *
 * The cache holds the in-flight promise, not just the finished file name.
 * Concurrent requests for the same URL (visitNode walks siblings in parallel)
 * therefore share one fetch and one stored file. A failed download is evicted
 * from the cache so that a later call can retry.
 *
 * NOTE(review): stylesheets that @import each other in a cycle are not handled;
 * the original re-fetched them endlessly and this version would wait on itself.
 *
 * @param {string} url - Absolute or relative URL of the resource.
 * @param {string} baseurl - Base used to resolve a relative `url`.
 * @returns {Promise<string>} The local file name, or the `data:` URL itself.
 */
async function inlineUrl(url, baseurl) {
  const resolved = new URL(url, baseurl).href;
  if (/^data:/i.test(resolved)) { return resolved; }
  if (!inlineUrl.cache[resolved]) {
    const pending = (async function fetchAndStore() {
      const resp = await fetch(resolved);
      const content_type = resp.headers.get('Content-Type');
      let data;
      if (/^text\/css/.test(content_type)) {
        data = await processCss(await resp.text(), resolved);
      } else {
        data = await resp.blob();
      }
      const file_name = getFileName(content_type);
      resources[file_name] = data;
      return file_name;
    })();
    inlineUrl.cache[resolved] = pending;
    // Forget failures; the rejection itself still reaches every caller awaiting `pending`.
    pending.catch(function() {
      if (inlineUrl.cache[resolved] === pending) {
        delete inlineUrl.cache[resolved];
      }
    });
  }
  return inlineUrl.cache[resolved];
}
// resolved URL -> promise of the local file name
inlineUrl.cache = {};
/**
 * Recursively rewrites a (cloned) DOM subtree in place so it can be saved offline:
 *  - text inside <style> is passed through processCss();
 *  - text inside <script> is emptied;
 *  - `style` attributes are passed through processCss();
 *  - URL attributes (see isUrlAttribute) are replaced by the local file name
 *    returned by inlineUrl();
 *  - `on*` handlers and blocked attributes (see isBlockedAttribute) are blanked.
 *
 * A URL attribute that is empty is skipped (resolving '' would fetch the page
 * itself). One that cannot be downloaded is logged and keeps its original value,
 * so a single broken resource does not abort the snapshot.
 *
 * @param {Node} elm - Node to process; children are visited in parallel.
 * @returns {Promise<void>} Settles when the whole subtree has been processed.
 */
async function visitNode(elm) {
  switch (elm.nodeType) {
    case Node.TEXT_NODE: {
      // tagName is lowercase for SVG/XHTML elements, so compare case-insensitively.
      const parentTag = elm.parentNode && elm.parentNode.tagName
        ? String(elm.parentNode.tagName).toUpperCase()
        : '';
      if (parentTag === 'STYLE') {
        elm.textContent = await processCss(elm.textContent, elm.baseURI);
      } else if (parentTag === 'SCRIPT') {
        elm.textContent = '';
      }
      break;
    }
    case Node.ELEMENT_NODE: {
      for (let i = 0; i < elm.attributes.length; i++) {
        const attr = elm.attributes[i];
        if (attr.name === 'style') {
          attr.value = await processCss(attr.value, elm.baseURI);
        } else if (isUrlAttribute(elm.tagName, attr.name)) {
          if (attr.value.trim() === '') continue;
          try {
            attr.value = await inlineUrl(attr.value, elm.baseURI);
          } catch (err) {
            console.warn('Could not inline ' + attr.name + '="' + attr.value + '":', err);
          }
        } else if (/^on/.test(attr.name) || isBlockedAttribute(elm.tagName, attr.name)) {
          attr.value = '';
        }
      }
      const pending = [];
      for (let child = elm.firstChild; child; child = child.nextSibling) {
        pending.push(visitNode(child));
      }
      await Promise.all(pending);
      break;
    }
  }
}
/**
 * Offers `blob` to the user as a file download named `filename` by clicking a
 * temporary, hidden <a download> element.
 *
 * The anchor is removed from the page again right after the click, and the
 * object URL is revoked after a delay, so neither is leaked. The revoke is
 * delayed because revoking immediately after click() can cancel the download
 * in some browsers.
 *
 * @param {string} filename - Suggested name for the saved file.
 * @param {Blob} blob - Content to save.
 * @returns {void}
 */
function download(filename, blob) {
  var href = URL.createObjectURL(blob);
  var element = document.createElement('a');
  element.setAttribute('href', href);
  element.setAttribute('download', filename);
  element.style.display = 'none';
  document.body.appendChild(element);
  try {
    element.click();
  } finally {
    document.body.removeChild(element);
    setTimeout(function() {
      URL.revokeObjectURL(href);
    }, 60000);
  }
}
/**
 * Builds the offline snapshot of `elm` and triggers its download as a zip file.
 *
 * Steps: clone the element, rewrite the clone with visitNode() while JSZip is
 * being loaded, write the serialised clone to `index.html` (preceded by the
 * page's doctype when it has one), add every entry of `resources`, and hand the
 * generated zip blob to download().
 *
 * The zip is named after the document title (or the page URL when the title is
 * empty), reduced to ASCII letters and digits. If nothing is left after that,
 * the name falls back to "page".
 *
 * @param {Element} elm - Root element to snapshot (the caller passes document.documentElement).
 * @returns {Promise<void>} Settles once the download has been triggered.
 */
async function getInline(elm) {
  const clone = elm.cloneNode(true);
  await Promise.all([
    visitNode(clone),
    loadJszip(),
  ]);
  var zip = new JSZip();

  // document.doctype is null for pages without a doctype declaration.
  var doctype = document.doctype;
  var doctype_str = '';
  if (doctype) {
    doctype_str = "<!DOCTYPE "
      + doctype.name
      + (doctype.publicId ? ' PUBLIC "' + doctype.publicId + '"' : '')
      + (!doctype.publicId && doctype.systemId ? ' SYSTEM' : '')
      + (doctype.systemId ? ' "' + doctype.systemId + '"' : '')
      + '>\n';
  }
  zip.file("index.html", doctype_str + clone.outerHTML);

  Object.entries(resources).forEach(([file, data]) => {
    zip.file(file, data);
  });

  var content = await zip.generateAsync({type:"blob"});
  var base_name = (document.title || location.href).replace(/[^a-z0-9]/gi, '') || 'page';
  download(base_name + '.zip', content);
}
getInline(document.documentElement);
}
// Run the snapshot as soon as the script is evaluated.
downloadPage();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment