Created
March 10, 2014 20:46
-
-
Save ilikenwf/9473950 to your computer and use it in GitHub Desktop.
Javascript Based MS Word/Excel/Outlook/etc HTML Cleaner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// The code below is amalgamated from https://groups.google.com/forum/#!topic/cleditor/WvU-RIMorp4
// and http://blog.tatedavies.com/2012/08/28/replace-microsoft-chars-in-javascript/
// with some modifications.
/**
 * Strips Microsoft Office (Word/Excel/Outlook) paste artifacts from an HTML string.
 *
 * Processing steps, in order:
 *  1. Removes `<!--[if ...]> ... <![endif]-->` conditional-comment blocks
 *     (text that follows the closing `<![endif]-->` is preserved).
 *  2. Lower-cases the text of every tag, renames strong/em/strike/u to
 *     b/i/del/ins (with or without attributes) and writes a bare `<br>` as `<br/>`.
 *  3. Keeps whitelisted tags together with their attributes, keeps list and
 *     line-break tags without attributes, and removes every other tag while
 *     leaving the text that follows it.
 *  4. Turns two consecutive newlines into `<br>`.
 *  5. Trims leading whitespace, `<br>` and empty `<p></p>` from the result.
 *  6. Returns "" when the result contains no visible text.
 *  7. Replaces typographic quotes, dashes, ellipsis and non-breaking spaces
 *     with plain ASCII equivalents.
 *
 * NOTE: this is a formatting cleaner, not a security sanitizer; attributes on
 * whitelisted tags are passed through untouched.
 *
 * @param {string} html - HTML fragment to clean; null/undefined yields "".
 * @returns {string} The cleaned HTML, or "" when nothing visible remains.
 */
function cleanMSJunk(html) {
  if (html === null || html === undefined) {
    return "";
  }

  var IF_OPEN = "!--[if ";
  var ENDIF = "![endif]-->";
  var RENAMES = { strong: "b", em: "i", strike: "del", u: "ins" };

  // Every tag regex is anchored at the start of the segment (no "m" flag) and
  // requires a boundary after the tag name, so "i" cannot match "img"/"iframe"
  // and a later text line can never be mistaken for the tag.
  var renameRE = /^(\/?)(strong|em|strike|u)(?=[\s\/>])/;
  // Put any tags you don't want the attributes stripped from in the list below,
  // e.g. to add "ul" insert it before span so it reads /^\/?(?:ul|span|...
  var keepRE = /^\/?(?:span|font|h1|h2|h3|h4|h5|u|b|i|table|tr|td|p|th|tbody|thead|caption)(?=[\s\/>])/;
  // Tags that are kept but lose all attributes; $3 preserves a self-closing "/".
  var stripAttrsRE = /^(\/?)(br|del|ins|i|li|ol|ul|dl|dt|dd)(?=[\s\/>])[^>]*?(\/?)>/;
  var leadingJunkRE = /^(?:\s|<br\/?>|<p>\s*<\/p>)+/;

  // ES5-compatible String.prototype.startsWith.
  function startsWith(str, prefix) {
    return str.lastIndexOf(prefix, 0) === 0;
  }

  // Replacement callback for renameRE: keeps the optional "/" and swaps the name.
  function renameTag(match, slash, name) {
    return slash + RENAMES[name];
  }

  // Returns the text content of `markup` so the caller can test for visible text.
  function visibleText(markup) {
    if (typeof document === "undefined" || !document.createElement) {
      // No DOM available: approximate by dropping tags and treating
      // non-breaking-space entities as whitespace.
      return markup.replace(/<[^>]*>/g, "").replace(/&nbsp;|&#160;|&#xa0;/gi, " ");
    }
    // Prefer a detached document so the untrusted markup is parsed without
    // being inserted into the live page.
    var owner = document;
    if (document.implementation && document.implementation.createHTMLDocument) {
      owner = document.implementation.createHTMLDocument("");
    }
    var probe = owner.createElement("div");
    probe.innerHTML = markup;
    return probe.textContent || probe.innerText || "";
  }

  // parts[0] is the text before the first "<"; it is never a tag, so the loop
  // starts at 1. Every later element begins right after a "<".
  var parts = String(html).split("<");
  for (var i = 1; i < parts.length; i++) {
    if (startsWith(parts[i], IF_OPEN)) {
      // Microsoft <!--[if ...]> ... <![endif]-->: blank everything up to the
      // terminator (or to the end of input when it is missing).
      parts[i] = "";
      for (i++; i < parts.length; i++) {
        if (startsWith(parts[i], ENDIF)) {
          parts[i] = parts[i].substring(ENDIF.length);
          break;
        }
        parts[i] = "";
      }
      continue;
    }

    var segment = parts[i];
    var close = segment.indexOf(">");
    if (close === -1) {
      // Not a complete tag: left as-is (the "<" itself is dropped by the join).
      continue;
    }

    // Lower-case only the tag text, never the content that follows it.
    segment = segment.substring(0, close).toLowerCase() + segment.substring(close);
    segment = segment.replace(renameRE, renameTag);
    if (startsWith(segment, "br>")) {
      segment = "br/>" + segment.substring(3);
    }

    if (keepRE.test(segment)) {
      segment = "<" + segment;
    } else if (stripAttrsRE.test(segment)) {
      segment = segment.replace(stripAttrsRE, "<$1$2$3>");
    } else {
      // Unknown tag: drop it, keep the text after it. The ">" position is
      // recomputed because renaming may have changed the tag's length.
      segment = segment.substring(segment.indexOf(">") + 1);
    }
    parts[i] = segment.replace(/\n\n/g, "<br>");
  }

  var cleaned = parts.join("").replace(leadingJunkRE, "");
  if (!/\S/.test(visibleText(cleaned))) {
    return "";
  }

  // Typographic characters -> ASCII. The classes deliberately contain no "|"
  // so that literal pipe characters in the content are left alone.
  cleaned = cleaned.replace(/[\u2018\u2019\u201A]/g, "'");
  cleaned = cleaned.replace(/[\u201C\u201D\u201E]/g, "\"");
  cleaned = cleaned.replace(/\u2026/g, "...");
  cleaned = cleaned.replace(/[\u2013\u2014]/g, "-");
  cleaned = cleaned.replace(/\u02C6/g, "^");
  cleaned = cleaned.replace(/\u2039/g, "");
  cleaned = cleaned.replace(/[\u02DC\u00A0]/g, " ");
  return cleaned;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment