Skip to content

Instantly share code, notes, and snippets.

@undrafted
Forked from leoherzog/*setup.md
Created March 18, 2021 14:27
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save undrafted/dc1d1512dab9c8ba94122c3760f0cc19 to your computer and use it in GitHub Desktop.
Save undrafted/dc1d1512dab9c8ba94122c3760f0cc19 to your computer and use it in GitHub Desktop.
Google Doc Clean HTML Output, powered by Apps Script

Google Doc Clean HTML Output, powered by Apps Script

Add this function to your Apps Script project and pass it the ID of a Google Doc.

If you need this for many Docs, you may want to create a separate Apps Script project to use as a library. Copy and paste this code into the new project, open the script that's tied to your Doc, add the new project as a Library (using its script ID), and then call var html = *NameOfYourLibraryProject*.getContent(DocumentApp.getActiveDocument().getId());

/**
 * Takes in a Google Doc ID, gets that doc in HTML format, cleans up the markup,
 * and returns the resulting HTML string.
 *
 * Freshly fetched HTML is stored in the script cache for 15 minutes, keyed by
 * the doc ID. Passing `useCaching = false` skips reading the cache (forcing a
 * fresh export) but still refreshes the cached copy.
 *
 * @param {string} id the id of the google doc
 * @param {boolean} [useCaching] enable or disable reading from the cache. default true.
 * @return {string} the doc's body in html format
 * @throws {string} a message string (not an Error object) when the id is missing,
 *     useCaching is not a boolean, the file cannot be opened, or the export
 *     request returns a non-2xx HTTP status
 */
function getContent(id, useCaching) {
  if (!id) {
    throw "Please call this API with a valid Google Doc ID";
  }
  if (useCaching == null) {
    useCaching = true;
  }
  if (typeof useCaching != "boolean") {
    throw "If you're going to specify useCaching, it must be boolean.";
  }

  var cache = CacheService.getScriptCache();
  if (useCaching) {
    var cached = cache.get(id); // see if we have a cached version of our parsed html
    if (cached) {
      Logger.log("Pulling doc html from cache...");
      Logger.log(cached);
      return cached;
    }
  }

  Logger.log("Grabbing and parsing fresh html from the doc...");
  try {
    // Only used to verify that the id resolves to a file we can access.
    DriveApp.getFileById(id);
  } catch (err) {
    throw "Please call this API with a valid Google Doc ID. " + err.message;
  }
  // Kept from the original: per the author, this call is needed so the Drive
  // scope is requested for ScriptApp.getOAuthToken().
  DriveApp.getStorageUsed();

  var url = "https://docs.google.com/feeds/download/documents/export/Export?id=" +
      encodeURIComponent(id) + "&exportFormat=html";
  var param = {
    method: "get",
    headers: {"Authorization": "Bearer " + ScriptApp.getOAuthToken()},
    // Muted so that we can report failures with our own message below.
    muteHttpExceptions: true
  };
  var response = UrlFetchApp.fetch(url, param);
  var status = response.getResponseCode();
  if (status < 200 || status >= 300) {
    // Without this check an error page would be "cleaned", returned as the
    // doc's content, and cached for 15 minutes.
    throw "Unable to export the Google Doc as HTML (HTTP " + status + ").";
  }

  var html = cleanExportedHtml_(response.getContentText());

  try {
    cache.put(id, html, 900); // cache doc contents for 15 minutes, in case we get a lot of requests
  } catch (err) {
    // The cache rejects oversized values; caching is only an optimisation, so
    // a large doc should still be returned rather than fail the whole call.
    Logger.log("Could not cache doc html: " + err.message);
  }

  Logger.log(html);
  return html;
}

/**
 * Strips the exported document down to bare body markup.
 *
 * @param {string} html raw HTML as returned by the Docs export endpoint
 * @return {string} the cleaned HTML
 */
function cleanExportedHtml_(html) {
  return html
      // nuke the whole head section, including the stylesheet and meta tag;
      // [\s\S] is used because "." does not match line breaks
      .replace(/<head\b[^>]*>[\s\S]*?<\/head>/, '')
      // remove almost all html attributes
      .replace(/ (id|class|style|start|colspan|rowspan)="[^"]*"/g, '')
      // remove all of the spans, as well as the outer html and body, including
      // opening tags that still carry an attribute not stripped above
      .replace(/<\/?(?:span|body|html)\b[^>]*>/g, '')
      // clearly the superior way of denoting line breaks
      .replace(/<br>/g, '<br />');
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment