fakedarren/regex.js

## regex.js
// NOTE(review): a later UI.TextEditor.implement() call in this file registers
// the same two method names again; presumably the later registration wins —
// confirm against UI.TextEditor.implement and consider keeping a single copy.
UI.TextEditor.implement({

	/**
	 * Cleans the markup held in `this.textarea.value` and writes the result
	 * back to the same property.
	 *
	 * The clean-up is a fixed sequence of regular-expression passes. Their
	 * order matters: tag and attribute names are lower-cased and attribute
	 * values quoted before the case-sensitive / quote-dependent passes run.
	 * If the markup carries Microsoft Word markers, `this.cleanPasteFromWord()`
	 * is invoked part-way through.
	 *
	 * Takes no arguments and returns nothing.
	 */
	cleanHTML: function() {

		var html = this.textarea.value;

		// Collapse runs of blank lines into a single newline
		html = html.replace(/\n\n+/g, "\n");

		// WebKit editing artefacts (placeholder <br>s and Apple-style-span wrappers)
		html = html.replace(/<br class\="webkit-block-placeholder">/gi, "<br />");
		html = html.replace(/<span class="Apple-style-span">(.*)<\/span>/gi, '$1');
		html = html.replace(/ class="Apple-style-span"/gi, '');

		// Lower-case element names ("<" up to the first space or ">")
		html = html.replace(/<[^> ]*/g, function(tagStart) {
			return tagStart.toLowerCase();
		});
		// Lower-case attribute names (each " name=" run inside a tag)
		html = html.replace(/<[^>]*>/g, function(tag) {
			return tag.replace(/ [^=]+=/g, function(attrName) {
				return attrName.toLowerCase();
			});
		});
		// Put double quotes around attribute values that do not start with one
		html = html.replace(/<[^>]*>/g, function(tag) {
			return tag.replace(/( [^=]+=)([^"][^ >]*)/g, "$1\"$2\"");
		});

		// Convert inline styles to <strong> / <em> tags etc (currently disabled)
		//html = html.replace(/<span style="font-weight: bold;">(.*)<\/span>/gi, '<strong>$1</strong>');
		//html = html.replace(/<b\b[^>]*>(.*?)<\/b[^>]*>/gi, '<strong>$1</strong>');
		//html = html.replace(/<span style="font-style: italic;">(.*)<\/span>/gi, '<em>$1</em>');
		//html = html.replace(/<i\b[^>]*>(.*?)<\/i[^>]*>/gi, '<em>$1</em>');
		// <u> becomes a span with an underline style
		html = html.replace(/<u\b[^>]*>(.*?)<\/u[^>]*>/gi, '<span style="text-decoration: underline;">$1</span>');

		// Strip presentational attributes and all classes (quoted values only,
		// which is why the quoting pass above has to run first)
		html = html.replace(/ (border|valign|align|width|language|height|class)=\"([^\"]*)\"/gi, "");

		// More complex semantics: unwrap <div>s inside list items, turn the
		// remaining <div>s into paragraphs and move lists out of paragraphs
		html = html.replace(/<li>\s*<div>(.+?)<\/div><\/li>/g, '<li>$1</li>');
		html = html.replace(/^([\w\s]+.*?)<div>/i, '<p>$1</p><div>');
		html = html.replace(/<div>(.+?)<\/div>/ig, '<p>$1</p>');
		html = html.replace(/<p>[\s\n]*(<(?:ul|ol)>.*?<\/(?:ul|ol)>)(.*?)<\/p>/ig, '$1<p>$2</p>');
		html = html.replace(/<\/(ol|ul)>\s*(?!<(?:p|ol|ul|img).*?>)((?:<[^>]*>)?\w.*)$/g, '</$1><p>$2</p>');

		// Remove <script> elements. `\b[^>]*` (rather than `[^>]+`) makes the
		// pattern match a bare <script> tag as well as one with attributes.
		html = html.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "");

		// cleanPasteFromWord() works on the textarea value, so publish the
		// intermediate result before calling it
		this.textarea.value = html;

		if (/(class=\"?Mso|style=\"[^\"]*\bmso\-|w:WordDocument)/.test(html)) {
			this.cleanPasteFromWord();
		}

		// Pick up whatever cleanPasteFromWord() wrote back
		html = this.textarea.value;

		// Convert <br> to <br />
		html = html.replace(/(<br>)/g, "<br />");

		// Sort out leading / trailing / useless / invalid <br />s
		html = html.replace(/<br ?\/?>$/gi, '');
		html = html.replace(/^<br ?\/?>/gi, '');
		html = html.replace(/><br ?\/?>/gi, '>');
		html = html.replace(/<br ?\/?>\s*<\/(h1|h2|h3|h4|h5|h6|li|p)/gi, '</$1');

		// Clean up paragraphs
		html = html.replace(/<p>\s*<br ?\/?>\s*<\/p>/gi, '<p>\u00a0</p>');
		html = html.replace(/<p>(&nbsp;|\s)*<\/p>/gi, '<p>\u00a0</p>');
		html = html.replace(/\s*<br ?\/?>\s*<\/p>/gi, '</p>');
		html = html.replace(/<p>(?:\s*)<p>/g, '<p>');
		html = html.replace(/<\/p>\s*<\/p>/g, '</p>');
		html = html.replace(/<p>\W*<\/p>/g, '');
		html = html.replace(/<br[^>]*><\/p>/g, '</p>');
		html = html.replace(/<p>\s*(<img[^>]+>)\s*<\/p>/ig, '$1\n');

		// Format sourcecode: one block element per line, list items indented
		html = html.replace(/<p([^>]*)>(.*?)<\/p>(?!\n)/g, '<p$1>$2</p>\n');
		html = html.replace(/<\/(ul|ol|p)>(?!\n)/g, '</$1>\n');
		html = html.replace(/><li>/g, '>\n\t<li>');
		html = html.replace(/([^\n])<\/(ol|ul)>/g, '$1\n</$2>');
		html = html.replace(/([^\n])<img/ig, '$1\n<img');
		html = html.replace(/^\s*$/g, '');
		html = html.trim();

		this.textarea.value = html;

	},

	/**
	 * Strips Microsoft Word specific markup from `this.textarea.value` and
	 * writes the result back to the same property.
	 *
	 * Called by `cleanHTML` when its Word-marker test matches. Takes no
	 * arguments and returns nothing.
	 */
	cleanPasteFromWord: function() {

		var html = this.textarea.value;

		// Remove MS word comments
		html = html.replace(/<!--[\s\S]+?-->/gi, "");
		// Dodgy tags and VML (the tags are dropped, their content is kept)
		html = html.replace(/<\/?(img|font|meta|link|style|div|v:\w+)[^>]*>/gi, "");
		// MS namespace elements
		html = html.replace(/<\/?o:[^>]*>/gi, "");
		// XML namespace declarations
		html = html.replace(/<\\?\?xml[^>]*>/gi, "");
		// on.., id, name, lang and similar attributes with and without quotes (different browsers)
		html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=\"([^\"]*)\"/gi, "");
		html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=(\w+)/gi, "");
		// <s> into <strike> for strikethrough
		html = html.replace(/<(\/?)s>/gi, "<$1strike>");
		// Only <span> elements left should be MS ones so can remove
		html = html.replace(/<span\b[^>]*>(.*?)<\/span[^>]*>/gi, '$1');
		// Strip mso classes, quoted and unquoted
		html = html.replace(/ class=\"(mso[^\"]*)\"/gi, "");
		html = html.replace(/ class=(mso\w+)/gi, "");
		// Eliminate all remaining style attributes
		html = html.replace(/ style=\"[^\"]*\"/gi, "");
		// TOC links: drop the href, then unwrap anchors left without attributes
		html = html.replace(/ href="[^#]+#_Toc[^\"]*\"/gi, "");
		html = html.replace(/<a>(.*?)<\/a>/gi, "$1");
		// All blank spans
		html = html.replace(/<span>(.*)<\/span>/gi, "$1");

		this.textarea.value = html;

	}

});
	// NOTE(review): an earlier UI.TextEditor.implement() call in this file
	// registers the same two method names; presumably this later registration
	// is the one in effect — confirm against UI.TextEditor.implement and
	// consider keeping a single copy.
	UI.TextEditor.implement({

		/**
		 * Cleans the markup held in `this.textarea.value` and writes the
		 * result back to the same property.
		 *
		 * The clean-up is a fixed sequence of regular-expression passes. Their
		 * order matters: tag and attribute names are lower-cased and attribute
		 * values quoted before the case-sensitive / quote-dependent passes
		 * run. If the markup carries Microsoft Word markers,
		 * `this.cleanPasteFromWord()` is invoked part-way through.
		 *
		 * Alternations in the patterns use a plain `|`; an escaped `\|` would
		 * only match a literal pipe character and silently disable the pass.
		 *
		 * Takes no arguments and returns nothing.
		 */
		cleanHTML: function() {

			var html = this.textarea.value;

			// Collapse runs of blank lines into a single newline
			html = html.replace(/\n\n+/g, "\n");

			// WebKit editing artefacts (placeholder <br>s and Apple-style-span wrappers)
			html = html.replace(/<br class\="webkit-block-placeholder">/gi, "<br />");
			html = html.replace(/<span class="Apple-style-span">(.*)<\/span>/gi, '$1');
			html = html.replace(/ class="Apple-style-span"/gi, '');

			// Lower-case element names ("<" up to the first space or ">")
			html = html.replace(/<[^> ]*/g, function(tagStart) {
				return tagStart.toLowerCase();
			});
			// Lower-case attribute names (each " name=" run inside a tag)
			html = html.replace(/<[^>]*>/g, function(tag) {
				return tag.replace(/ [^=]+=/g, function(attrName) {
					return attrName.toLowerCase();
				});
			});
			// Put double quotes around attribute values that do not start with one
			html = html.replace(/<[^>]*>/g, function(tag) {
				return tag.replace(/( [^=]+=)([^"][^ >]*)/g, "$1\"$2\"");
			});

			// Convert inline styles to <strong> / <em> tags etc (currently disabled)
			//html = html.replace(/<span style="font-weight: bold;">(.*)<\/span>/gi, '<strong>$1</strong>');
			//html = html.replace(/<b\b[^>]*>(.*?)<\/b[^>]*>/gi, '<strong>$1</strong>');
			//html = html.replace(/<span style="font-style: italic;">(.*)<\/span>/gi, '<em>$1</em>');
			//html = html.replace(/<i\b[^>]*>(.*?)<\/i[^>]*>/gi, '<em>$1</em>');
			// <u> becomes a span with an underline style
			html = html.replace(/<u\b[^>]*>(.*?)<\/u[^>]*>/gi, '<span style="text-decoration: underline;">$1</span>');

			// Strip presentational attributes and all classes (quoted values
			// only, which is why the quoting pass above has to run first)
			html = html.replace(/ (border|valign|align|width|language|height|class)=\"([^\"]*)\"/gi, "");

			// More complex semantics: unwrap <div>s inside list items, turn the
			// remaining <div>s into paragraphs and move lists out of paragraphs
			html = html.replace(/<li>\s*<div>(.+?)<\/div><\/li>/g, '<li>$1</li>');
			html = html.replace(/^([\w\s]+.*?)<div>/i, '<p>$1</p><div>');
			html = html.replace(/<div>(.+?)<\/div>/ig, '<p>$1</p>');
			html = html.replace(/<p>[\s\n]*(<(?:ul|ol)>.*?<\/(?:ul|ol)>)(.*?)<\/p>/ig, '$1<p>$2</p>');
			html = html.replace(/<\/(ol|ul)>\s*(?!<(?:p|ol|ul|img).*?>)((?:<[^>]*>)?\w.*)$/g, '</$1><p>$2</p>');

			// Remove <script> elements. `\b[^>]*` (rather than `[^>]+`) makes
			// the pattern match a bare <script> tag as well as one with
			// attributes.
			html = html.replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, "");

			// cleanPasteFromWord() works on the textarea value, so publish the
			// intermediate result before calling it
			this.textarea.value = html;

			if (/(class=\"?Mso|style=\"[^\"]*\bmso\-|w:WordDocument)/.test(html)) {
				this.cleanPasteFromWord();
			}

			// Pick up whatever cleanPasteFromWord() wrote back
			html = this.textarea.value;

			// Convert <br> to <br />
			html = html.replace(/(<br>)/g, "<br />");

			// Sort out leading / trailing / useless / invalid <br />s
			html = html.replace(/<br ?\/?>$/gi, '');
			html = html.replace(/^<br ?\/?>/gi, '');
			html = html.replace(/><br ?\/?>/gi, '>');
			html = html.replace(/<br ?\/?>\s*<\/(h1|h2|h3|h4|h5|h6|li|p)/gi, '</$1');

			// Clean up paragraphs
			html = html.replace(/<p>\s*<br ?\/?>\s*<\/p>/gi, '<p>\u00a0</p>');
			html = html.replace(/<p>(&nbsp;|\s)*<\/p>/gi, '<p>\u00a0</p>');
			html = html.replace(/\s*<br ?\/?>\s*<\/p>/gi, '</p>');
			html = html.replace(/<p>(?:\s*)<p>/g, '<p>');
			html = html.replace(/<\/p>\s*<\/p>/g, '</p>');
			html = html.replace(/<p>\W*<\/p>/g, '');
			html = html.replace(/<br[^>]*><\/p>/g, '</p>');
			html = html.replace(/<p>\s*(<img[^>]+>)\s*<\/p>/ig, '$1\n');

			// Format sourcecode: one block element per line, list items indented
			html = html.replace(/<p([^>]*)>(.*?)<\/p>(?!\n)/g, '<p$1>$2</p>\n');
			html = html.replace(/<\/(ul|ol|p)>(?!\n)/g, '</$1>\n');
			html = html.replace(/><li>/g, '>\n\t<li>');
			html = html.replace(/([^\n])<\/(ol|ul)>/g, '$1\n</$2>');
			html = html.replace(/([^\n])<img/ig, '$1\n<img');
			html = html.replace(/^\s*$/g, '');
			html = html.trim();

			this.textarea.value = html;

		},

		/**
		 * Strips Microsoft Word specific markup from `this.textarea.value`
		 * and writes the result back to the same property.
		 *
		 * Called by `cleanHTML` when its Word-marker test matches. Takes no
		 * arguments and returns nothing.
		 */
		cleanPasteFromWord: function() {

			var html = this.textarea.value;

			// Remove MS word comments
			html = html.replace(/<!--[\s\S]+?-->/gi, "");
			// Dodgy tags and VML (the tags are dropped, their content is kept)
			html = html.replace(/<\/?(img|font|meta|link|style|div|v:\w+)[^>]*>/gi, "");
			// MS namespace elements
			html = html.replace(/<\/?o:[^>]*>/gi, "");
			// XML namespace declarations
			html = html.replace(/<\\?\?xml[^>]*>/gi, "");
			// on.., id, name, lang and similar attributes with and without quotes (different browsers)
			html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=\"([^\"]*)\"/gi, "");
			html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=(\w+)/gi, "");
			// <s> into <strike> for strikethrough
			html = html.replace(/<(\/?)s>/gi, "<$1strike>");
			// Only <span> elements left should be MS ones so can remove
			html = html.replace(/<span\b[^>]*>(.*?)<\/span[^>]*>/gi, '$1');
			// Strip mso classes, quoted and unquoted
			html = html.replace(/ class=\"(mso[^\"]*)\"/gi, "");
			html = html.replace(/ class=(mso\w+)/gi, "");
			// Eliminate all remaining style attributes
			html = html.replace(/ style=\"[^\"]*\"/gi, "");
			// TOC links: drop the href, then unwrap anchors left without attributes
			html = html.replace(/ href="[^#]+#_Toc[^\"]*\"/gi, "");
			html = html.replace(/<a>(.*?)<\/a>/gi, "$1");
			// All blank spans
			html = html.replace(/<span>(.*)<\/span>/gi, "$1");

			this.textarea.value = html;

		}

	});