ilikenwf/cleanMSJunk.js

## cleanMSJunk.js
  //the below is amalgamated from https://groups.google.com/forum/#!topic/cleditor/WvU-RIMorp4
  //and http://blog.tatedavies.com/2012/08/28/replace-microsoft-chars-in-javascript/
  //with some modification

  /**
   * Cleans up HTML pasted from Microsoft Office.
   *
   * Steps, in order:
   *  1. removes Microsoft conditional comments (<!--[if ...]> ... <![endif]-->);
   *  2. lower-cases tag text and rewrites strong/em/strike/u/br to b/i/del/ins/br/;
   *  3. keeps whitelisted tags (with or without attributes, see the two
   *     regular expressions below) and drops every other tag, keeping its text;
   *  4. turns blank lines that follow a tag into <br>;
   *  5. trims leading whitespace / empty paragraphs, and returns "" when the
   *     result renders no visible text;
   *  6. replaces "smart" punctuation with plain ASCII equivalents.
   *
   * Requires a browser-like global `document` (used for the visible-text check).
   *
   * @param {string} html Markup to clean; null/undefined is treated as empty.
   * @returns {string} The cleaned markup, or "" when no visible text remains.
   */
  function cleanMSJunk(html) {
    if (html === null || html === undefined) return "";

    var COND_OPEN = "!--[if ";
    var COND_CLOSE = "![endif]-->";

    // Tag rewrites, applied to both opening and closing tags without attributes.
    var search = ["strong>","em>","strike>","u>","br>"];
    var replace = ["b>","i>","del>","ins>","br/>"];

    //put any tags you don't want the attributes stripped from in the var below, ie to add "ul"
    //you would insert it before span, with a pipe after so it'd look like /^\/?(ul|span|...
    // The (?=[\s\/>]) lookahead forces a whole-name match, so "b" no longer
    // matches "body" and "i" no longer matches "img". There is no "m" flag:
    // the tag must sit at the very start of the segment.
    var spellcheckerRE = /^\/?(span|font|h1|h2|h3|h4|h5|u|b|i|table|tr|td|p|th|tbody|thead|caption)(?=[\s\/>])[^\/>]*\/?>/;
    // Tags that are kept but have all of their attributes removed.
    var cleanupRE = /^(\/?)(br|del|ins|i|li|ol|ul|dl|dt|dd)(?=[\s\/>])[^\/>]*(\/?)>/;

    var ary = html.split("<");

    // ary[0] is whatever precedes the first "<"; it is plain text, never a
    // tag, so processing starts at index 1.
    for (var i = 1; i < ary.length; i++) {
      // lastIndexOf(x, 0) === 0 is an ES5-safe "starts with".
      if (ary[i].lastIndexOf(COND_OPEN, 0) === 0) { // handle Microsoft <!--[if ... <![endif]-->
        ary[i] = "";
        for (i++; i < ary.length; i++) {
          if (ary[i].lastIndexOf(COND_CLOSE, 0) === 0) {
            // Drop only the closing marker; text after it is real content.
            ary[i] = ary[i].substring(COND_CLOSE.length);
            break;
          }
          ary[i] = "";
        }
        // Either the comment was closed (move on to the next segment) or the
        // input ran out (the loop condition ends the scan).
        continue;
      }

      var end = ary[i].indexOf(">");
      if (end === -1) continue;

      // Lower-case the tag part only; the text after ">" is left untouched.
      ary[i] = ary[i].substring(0, end).toLowerCase() + ary[i].substring(end);

      for (var j = 0; j < search.length; j++) {
        var offset = ary[i].charAt(0) === "/" ? 1 : 0;
        if (ary[i].substring(offset, offset + search[j].length) === search[j]) {
          ary[i] = (offset === 1 ? "/" : "") + replace[j] + ary[i].substring(offset + search[j].length);
        }
      }

      if (spellcheckerRE.test(ary[i])) {
        ary[i] = "<" + ary[i];
      } else if (cleanupRE.test(ary[i])) {
        ary[i] = ary[i].replace(cleanupRE, "<$1$2$3>");
      } else {
        // Unknown tag: remove the tag itself, keep the text that follows it.
        ary[i] = ary[i].substring(end + 1);
      }
      ary[i] = ary[i].replace(/\n\n/g, "<br>");
    }

    html = ary.join("");

    var trimRE = /^(\s+|&nbsp;|<br\/?>|<p>(&nbsp;)*<\/p>)+/m;
    if (trimRE.test(html)) {
      html = html.replace(trimRE, "");
    }

    // Render into a hidden element to find out whether any visible text is left.
    // NOTE(review): innerHTML on an element attached to the page parses
    // untrusted markup; attributes on whitelisted tags are still passed through.
    var body = document.getElementsByTagName("body")[0];
    var div = document.createElement("div");
    div.style.display = "none";
    body.appendChild(div);
    div.innerHTML = html;
    var text = div.innerText || div.textContent;
    body.removeChild(div);
    var hasTextRE = /\S/;
    if (!hasTextRE.test(text)) html = "";

    // Inside a character class "|" is a literal pipe, not alternation, so the
    // classes list only the code points that should be replaced.
    html = html.replace(/[\u2018\u2019\u201A]/g, "\'");
    html = html.replace(/[\u201C\u201D\u201E]/g, "\"");
    html = html.replace(/\u2026/g, "...");
    html = html.replace(/[\u2013\u2014]/g, "-");
    html = html.replace(/\u02C6/g, "^");
    html = html.replace(/\u2039/g, "");
    html = html.replace(/[\u02DC\u00A0]/g, " ");
    return html;
  }
	//the below is amalgamated from https://groups.google.com/forum/#!topic/cleditor/WvU-RIMorp4
	//and http://blog.tatedavies.com/2012/08/28/replace-microsoft-chars-in-javascript/
	//with some modification

	/**
	 * Cleans up HTML pasted from Microsoft Office.
	 *
	 * Steps, in order:
	 *  1. removes Microsoft conditional comments (<!--[if ...]> ... <![endif]-->);
	 *  2. lower-cases tag text and rewrites strong/em/strike/u/br to b/i/del/ins/br/;
	 *  3. keeps whitelisted tags (with or without attributes, see the two
	 *     regular expressions below) and drops every other tag, keeping its text;
	 *  4. turns blank lines that follow a tag into <br>;
	 *  5. trims leading whitespace / empty paragraphs, and returns "" when the
	 *     result renders no visible text;
	 *  6. replaces "smart" punctuation with plain ASCII equivalents.
	 *
	 * Requires a browser-like global `document` (used for the visible-text check).
	 *
	 * @param {string} html Markup to clean; null/undefined is treated as empty.
	 * @returns {string} The cleaned markup, or "" when no visible text remains.
	 */
	function cleanMSJunk(html) {
		if (html === null || html === undefined) return "";

		var COND_OPEN = "!--[if ";
		var COND_CLOSE = "![endif]-->";

		// Tag rewrites, applied to both opening and closing tags without attributes.
		var search = ["strong>","em>","strike>","u>","br>"];
		var replace = ["b>","i>","del>","ins>","br/>"];

		//put any tags you don't want the attributes stripped from in the var below, ie to add "ul"
		//you would insert it before span, with a pipe after so it'd look like /^\/?(ul|span|...
		// The (?=[\s\/>]) lookahead forces a whole-name match, so "b" no longer
		// matches "body" and "i" no longer matches "img". There is no "m" flag:
		// the tag must sit at the very start of the segment.
		var spellcheckerRE = /^\/?(span|font|h1|h2|h3|h4|h5|u|b|i|table|tr|td|p|th|tbody|thead|caption)(?=[\s\/>])[^\/>]*\/?>/;
		// Tags that are kept but have all of their attributes removed.
		var cleanupRE = /^(\/?)(br|del|ins|i|li|ol|ul|dl|dt|dd)(?=[\s\/>])[^\/>]*(\/?)>/;

		var ary = html.split("<");

		// ary[0] is whatever precedes the first "<"; it is plain text, never a
		// tag, so processing starts at index 1.
		for (var i = 1; i < ary.length; i++) {
			// lastIndexOf(x, 0) === 0 is an ES5-safe "starts with".
			if (ary[i].lastIndexOf(COND_OPEN, 0) === 0) { // handle Microsoft <!--[if ... <![endif]-->
				ary[i] = "";
				for (i++; i < ary.length; i++) {
					if (ary[i].lastIndexOf(COND_CLOSE, 0) === 0) {
						// Drop only the closing marker; text after it is real content.
						ary[i] = ary[i].substring(COND_CLOSE.length);
						break;
					}
					ary[i] = "";
				}
				// Either the comment was closed (move on to the next segment) or the
				// input ran out (the loop condition ends the scan).
				continue;
			}

			var end = ary[i].indexOf(">");
			if (end === -1) continue;

			// Lower-case the tag part only; the text after ">" is left untouched.
			ary[i] = ary[i].substring(0, end).toLowerCase() + ary[i].substring(end);

			for (var j = 0; j < search.length; j++) {
				var offset = ary[i].charAt(0) === "/" ? 1 : 0;
				if (ary[i].substring(offset, offset + search[j].length) === search[j]) {
					ary[i] = (offset === 1 ? "/" : "") + replace[j] + ary[i].substring(offset + search[j].length);
				}
			}

			if (spellcheckerRE.test(ary[i])) {
				ary[i] = "<" + ary[i];
			} else if (cleanupRE.test(ary[i])) {
				ary[i] = ary[i].replace(cleanupRE, "<$1$2$3>");
			} else {
				// Unknown tag: remove the tag itself, keep the text that follows it.
				ary[i] = ary[i].substring(end + 1);
			}
			ary[i] = ary[i].replace(/\n\n/g, "<br>");
		}

		html = ary.join("");

		var trimRE = /^(\s+|&nbsp;|<br\/?>|<p>(&nbsp;)*<\/p>)+/m;
		if (trimRE.test(html)) {
			html = html.replace(trimRE, "");
		}

		// Render into a hidden element to find out whether any visible text is left.
		// NOTE(review): innerHTML on an element attached to the page parses
		// untrusted markup; attributes on whitelisted tags are still passed through.
		var body = document.getElementsByTagName("body")[0];
		var div = document.createElement("div");
		div.style.display = "none";
		body.appendChild(div);
		div.innerHTML = html;
		var text = div.innerText || div.textContent;
		body.removeChild(div);
		var hasTextRE = /\S/;
		if (!hasTextRE.test(text)) html = "";

		// Inside a character class "|" is a literal pipe, not alternation, so the
		// classes list only the code points that should be replaced.
		html = html.replace(/[\u2018\u2019\u201A]/g, "\'");
		html = html.replace(/[\u201C\u201D\u201E]/g, "\"");
		html = html.replace(/\u2026/g, "...");
		html = html.replace(/[\u2013\u2014]/g, "-");
		html = html.replace(/\u02C6/g, "^");
		html = html.replace(/\u2039/g, "");
		html = html.replace(/[\u02DC\u00A0]/g, " ");
		return html;
	}