Skip to content

Instantly share code, notes, and snippets.

@ilikenwf
Created March 10, 2014 20:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ilikenwf/9473950 to your computer and use it in GitHub Desktop.
Save ilikenwf/9473950 to your computer and use it in GitHub Desktop.
Javascript Based MS Word/Excel/Outlook/etc HTML Cleaner
//the below is amalgamated from https://groups.google.com/forum/#!topic/cleditor/WvU-RIMorp4
//and http://blog.tatedavies.com/2012/08/28/replace-microsoft-chars-in-javascript/
//with some modification
/**
 * Cleans up HTML pasted from Microsoft Word/Excel/Outlook.
 *
 * Steps, in order:
 *  1. Removes downlevel-hidden conditional comments
 *     (`<!--[if ...]> ... <![endif]-->`) but keeps any text that directly
 *     follows the closing `-->`. An unterminated conditional comment swallows
 *     the rest of the input.
 *  2. Lower-cases every tag (name and attributes) and rewrites attribute-less
 *     strong/em/strike/u/br tags to b/i/del/ins/br/.
 *  3. Tags listed in `keepAttrsRE` are kept together with their attributes,
 *     tags listed in `stripAttrsRE` are kept without attributes, and every
 *     other tag is dropped while the text after it is kept.
 *  4. Replaces "\n\n" inside each tag chunk with `<br>`, then trims leading
 *     whitespace, `&nbsp;`, `<br>` and empty paragraphs from the result.
 *  5. Returns "" when the remaining markup contains no visible text.
 *  6. Replaces typographic quotes, dashes, ellipsis and non-breaking spaces
 *     with plain ASCII equivalents.
 *
 * Requires a browser DOM: a hidden <div> is temporarily appended to <body>
 * to extract the text content for step 5.
 *
 * NOTE: attributes on the tags in `keepAttrsRE` are preserved verbatim, so
 * the return value must not be treated as sanitized/safe HTML.
 *
 * @param {string} html Raw HTML, typically from a paste event.
 * @returns {string} The cleaned HTML, or "" if it has no visible text.
 */
function cleanMSJunk(html) {
  var COND_OPEN = "!--[if ";
  var COND_CLOSE = "![endif]-->";

  // Attribute-less tags that are rewritten to a canonical short form.
  var search = ["strong>", "em>", "strike>", "u>", "br>"];
  var replace = ["b>", "i>", "del>", "ins>", "br/>"];

  // Put any tags you don't want the attributes stripped from in keepAttrsRE,
  // e.g. to add "ul" insert it before span followed by a pipe: (ul|span|...
  // The (?=[\s\/>]) lookahead forces a full tag-name match, so "img" or
  // "body" are no longer accepted as "i" / "b" and "link" is not read as "li".
  var keepAttrsRE = /^\/?(span|font|h1|h2|h3|h4|h5|u|b|i|table|tr|td|p|th|tbody|thead|caption)(?=[\s\/>])[^\/>]*\/?>/;
  // Tags that are kept, but with every attribute removed.
  var stripAttrsRE = /^(\/?)(br|del|ins|i|li|ol|ul|dl|dt|dd)(?=[\s\/>])[^\/>]*(\/?)>/;

  // After splitting on "<", ary[0] is the plain text before the first tag and
  // every later entry is "tagname attrs>following text".
  var ary = html.split("<");

  // Start at 1: ary[0] is never a tag, so a ">" inside it must not be parsed.
  for (var i = 1; i < ary.length; i++) {
    if (ary[i].lastIndexOf(COND_OPEN, 0) === 0) { // handle Microsoft <!--[if ... <![endif]-->
      ary[i] = "";
      var tail = null;
      for (i++; i < ary.length; i++) {
        if (ary[i].lastIndexOf(COND_CLOSE, 0) === 0) {
          // Keep whatever follows the closing "-->"; it is ordinary content.
          tail = ary[i].substring(COND_CLOSE.length);
          break;
        }
        ary[i] = "";
      }
      if (tail === null) break; // unterminated comment: nothing left to process
      ary[i] = tail.replace(/\n\n/g, "<br>");
      continue;
    }

    var end = ary[i].indexOf(">");
    if (end === -1) continue;

    // Lower-case the tag itself (everything up to the first ">"), not the text.
    ary[i] = ary[i].substring(0, end).toLowerCase() + ary[i].substring(end);

    // Closing tags start with "/", so the tag name begins at offset 1.
    var pos = ary[i].charAt(0) === "/" ? 1 : 0;
    for (var j = 0; j < search.length; j++) {
      if (ary[i].substring(pos, pos + search[j].length) === search[j]) {
        ary[i] = (pos === 1 ? "/" : "") + replace[j] + ary[i].substring(pos + search[j].length);
        break;
      }
    }

    if (keepAttrsRE.test(ary[i])) {
      ary[i] = "<" + ary[i];
    } else if (stripAttrsRE.test(ary[i])) {
      ary[i] = ary[i].replace(stripAttrsRE, "<$1$2$3>");
    } else {
      // Unknown tag: drop it and keep only the text that follows.
      ary[i] = ary[i].substring(end + 1);
    }
    ary[i] = ary[i].replace(/\n\n/g, "<br>");
  }
  html = ary.join("");

  // Trim leading whitespace, &nbsp;, <br> and empty paragraphs (start of string only).
  html = html.replace(/^(\s+|&nbsp;|<br\/?>|<p>(&nbsp;)*<\/p>)+/, "");

  // Let the browser compute the visible text; if there is none, return "".
  var body = document.getElementsByTagName("body")[0];
  var div = document.createElement("div");
  var text;
  div.style.display = "none";
  body.appendChild(div);
  try {
    div.innerHTML = html;
    text = div.innerText || div.textContent;
  } finally {
    body.removeChild(div);
  }
  if (!/\S/.test(text)) html = "";

  // Replace Microsoft "smart" punctuation with ASCII. Character classes list
  // the code points directly; a "|" inside [...] would match a literal pipe.
  html = html.replace(/[\u2018\u2019\u201A]/g, "'");
  html = html.replace(/[\u201C\u201D\u201E]/g, "\"");
  html = html.replace(/\u2026/g, "...");
  html = html.replace(/[\u2013\u2014]/g, "-");
  html = html.replace(/\u02C6/g, "^");
  html = html.replace(/\u2039/g, "");
  html = html.replace(/[\u02DC\u00A0]/g, " ");
  return html;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment