Skip to content

Instantly share code, notes, and snippets.

@fakedarren
Created March 27, 2013 22:20
Show Gist options
  • Save fakedarren/5258633 to your computer and use it in GitHub Desktop.
Save fakedarren/5258633 to your computer and use it in GitHub Desktop.
Just a few regexes I wrote / stole when creating a WYSIWYG text editor a few years ago. God give me strength. Related: http://stackoverflow.com/a/1732454/299237
UI.TextEditor.implement({

    /**
     * Tidies the markup held in `this.textarea.value` and writes the result
     * back to `this.textarea.value`. Takes no arguments and returns nothing.
     *
     * The passes run in a fixed order: browser-specific junk is removed, tag
     * and attribute names are lower-cased, attribute values are double-quoted,
     * presentational attributes are stripped, <div>s become <p>s, <script>
     * blocks are dropped, Word markup is handed to cleanPasteFromWord(), and
     * finally <br>/<p> debris is tidied and the source is line-broken.
     *
     * NOTE: this is regex-based tidying, not a security sanitizer. Only
     * <script> elements are removed here; event-handler attributes, for
     * example, are left untouched by this method.
     */
    cleanHTML: function() {
        var html = this.textarea.value;

        // Word detection has to look at the *raw* markup as well: the passes
        // below strip every class attribute and lower-case element names, which
        // would otherwise hide "class=MsoNormal" and "<w:WordDocument>".
        // The regex is deliberately non-global so .test() keeps no state.
        var wordMarkup = /(class=["']?Mso|style=["']?[^"'>]*\bmso-|w:WordDocument)/i;
        var isWordPaste = wordMarkup.test(html);

        // Remove double new lines
        html = html.replace(/\n\n+/g, "\n");

        // Stupid apple-style-spans etc
        html = html.replace(/<br class\="webkit-block-placeholder">/gi, "<br />");
        // Unwrap bare Apple-style spans. Opening and closing span tags are
        // paired with a stack so that sibling and nested spans keep their own
        // closing tags (a single greedy/lazy regex cannot do that).
        var droppedSpans = [];
        html = html.replace(/<span\b[^>]*>|<\/span\s*>/gi, function(tag) {
            if (tag.charAt(1) === '/') {
                // Drop the closing tag only if its opening tag was dropped.
                return droppedSpans.pop() ? '' : tag;
            }
            var isPlaceholder = /^<span class="Apple-style-span">$/i.test(tag);
            droppedSpans.push(isPlaceholder);
            return isPlaceholder ? '' : tag;
        });
        // Spans that carry other attributes too just lose the class.
        html = html.replace(/ class="Apple-style-span"/gi, '');

        // Replace uppercase element names with lowercase. The name ends at any
        // whitespace, "/" or ">", so a line break after the name cannot drag
        // attribute values into the lower-casing.
        html = html.replace(/<\/?[a-z][^\s>\/]*/gi, function(match) {
            return match.toLowerCase();
        });

        // Lower-case attribute names and put double quotes around every value.
        // Each attribute is consumed as one token (name plus optional quoted or
        // unquoted value), so text inside a quoted value is never mistaken for
        // another attribute. Whitespace before an attribute is normalised to a
        // single space, which is what the later attribute-stripping regexes expect.
        html = html.replace(/<[a-z][^>]*>/gi, function(tag) {
            return tag.replace(/\s+([^\s"'<>\/=]+)(?:\s*=\s*("[^"]*"|'[^']*'|[^\s"'>]+))?/g, function(whole, name, value) {
                var attribute = ' ' + name.toLowerCase();
                if (!value) {
                    // Attribute without a value (e.g. "disabled").
                    return attribute;
                }
                var firstChar = value.charAt(0);
                if (firstChar === "'") {
                    // Re-quote single-quoted values; escape embedded double quotes.
                    value = '"' + value.slice(1, -1).replace(/"/g, '&quot;') + '"';
                } else if (firstChar !== '"') {
                    value = '"' + value + '"';
                }
                return attribute + '=' + value;
            });
        });

        // Convert inline styles to <strong> / <em> tags etc
        //html = html.replace(/<span style="font-weight: bold;">(.*)<\/span>/gi, '<strong>$1</strong>');
        //html = html.replace(/<b\b[^>]*>(.*?)<\/b[^>]*>/gi, '<strong>$1</strong>');
        //html = html.replace(/<span style="font-style: italic;">(.*)<\/span>/gi, '<em>$1</em>');
        //html = html.replace(/<i\b[^>]*>(.*?)<\/i[^>]*>/gi, '<em>$1</em>');
        // The closing pattern is </u> only, so it can no longer match </ul>.
        html = html.replace(/<u\b[^>]*>(.*?)<\/u\s*>/gi, '<span style="text-decoration: underline;">$1</span>');

        // strip stupid attributes and all classes
        html = html.replace(/ (border|valign|align|width|language|height|class)=\"([^\"]*)\"/gi, "");

        // More complex semantics
        html = html.replace(/<li>\s*<div>(.+?)<\/div><\/li>/g, '<li>$1</li>');
        html = html.replace(/^([\w\s]+.*?)<div>/i, '<p>$1</p><div>');
        html = html.replace(/<div>(.+?)<\/div>/ig, '<p>$1</p>');
        html = html.replace(/<p>[\s\n]*(<(?:ul|ol)>.*?<\/(?:ul|ol)>)(.*?)<\/p>/ig, '$1<p>$2</p>');
        html = html.replace(/<\/(ol|ul)>\s*(?!<(?:p|ol|ul|img).*?>)((?:<[^>]*>)?\w.*)$/g, '</$1><p>$2</p>');

        // Any <script> tags, with or without attributes
        html = html.replace(/<script\b[^>]*>[\s\S]*?<\/script\s*>/gi, "");

        // cleanPasteFromWord() works on the textarea, so hand over through it.
        this.textarea.value = html;
        if (isWordPaste || wordMarkup.test(html)) {
            this.cleanPasteFromWord();
        }
        html = this.textarea.value;

        // Convert <br> to <br />
        html = html.replace(/(<br>)/g, "<br />");

        // Sort out leading / trailing / useless / invalid <br />s
        html = html.replace(/<br ?\/?>$/gi, '');
        html = html.replace(/^<br ?\/?>/gi, '');
        html = html.replace(/><br ?\/?>/gi, '>');
        html = html.replace(/<br ?\/?>\s*<\/(h1|h2|h3|h4|h5|h6|li|p)/gi, '</$1');

        // Clean up paragraphs
        html = html.replace(/<p>\s*<br ?\/?>\s*<\/p>/gi, '<p>\u00a0</p>');
        html = html.replace(/<p>(&nbsp;|\s)*<\/p>/gi, '<p>\u00a0</p>');
        html = html.replace(/\s*<br ?\/?>\s*<\/p>/gi, '</p>');
        html = html.replace(/<p>(?:\s*)<p>/g, '<p>');
        html = html.replace(/<\/p>\s*<\/p>/g, '</p>');
        // Drop paragraphs with no word content. \W is ASCII-only, so the class
        // explicitly keeps everything from U+00A1 upwards: a paragraph written
        // entirely in a non-Latin script must not be treated as empty.
        // U+00A0 (the nbsp inserted above) still counts as empty.
        html = html.replace(/<p>[^\w\u00A1-\uFFFF]*<\/p>/g, '');
        html = html.replace(/<br[^>]*><\/p>/g, '</p>');
        html = html.replace(/<p>\s*(<img[^>]+>)\s*<\/p>/ig, '$1\n');

        // Format sourcecode
        html = html.replace(/<p([^>]*)>(.*?)<\/p>(?!\n)/g, '<p$1>$2</p>\n');
        html = html.replace(/<\/(ul|ol|p)>(?!\n)/g, '</$1>\n');
        html = html.replace(/><li>/g, '>\n\t<li>');
        html = html.replace(/([^\n])<\/(ol|ul)>/g, '$1\n</$2>');
        html = html.replace(/([^\n])<img/ig, '$1\n<img');
        // trim() also turns a whitespace-only document into "".
        html = html.trim();

        this.textarea.value = html;
    },

    /**
     * Strips Microsoft Word paste artefacts from `this.textarea.value` and
     * writes the result back to `this.textarea.value`. Takes no arguments and
     * returns nothing. Called by cleanHTML() when the markup looks like Word
     * output.
     *
     * Removes comments, <style> blocks, <img>/<font>/<meta>/<link>/<div>/VML
     * and o:* tags, XML declarations, a fixed list of attributes (including
     * on* handlers), every <span> tag, mso classes, style attributes and
     * Word table-of-contents links.
     */
    cleanPasteFromWord: function() {
        var html = this.textarea.value;

        // Remove MS word comments. "*?" also matches the empty comment
        // "<!---->", which "+?" would skip and then over-match from.
        html = html.replace(/<!--[\s\S]*?-->/g, "");

        // Remove <style> blocks together with their CSS; stripping only the
        // tags (next step) would leave the rules behind as visible text.
        html = html.replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, "");

        // Dodgy tags and VML
        html = html.replace(/<\/?(img|font|meta|link|style|div|v:\w+)[^>]*>/gi, "");

        // MS namespace elements
        html = html.replace(/<\/?o:[^>]*>/gi, "");

        // XML namespace declarations
        html = html.replace(/<\\?\?xml[^>]*>/gi, "");

        // on.., class, style and other attributes with and without quotes (different browsers)
        html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=\"([^\"]*)\"/gi, "");
        html = html.replace(/ (id|name|lang|type|clear|start|language|on\w+|v:\w+|w:\w+)=(\w+)/gi, "");

        // <s> into <strike> for strikethrough
        html = html.replace(/<(\/?)s>/gi, "<$1strike>");

        // Only <span> elements left should be MS ones so can remove.
        // Removing the tags themselves (rather than matching open/close pairs)
        // copes with nested spans and spans that run over several lines.
        html = html.replace(/<\/?span\b[^>]*>/gi, "");

        // Strip stupid mso classes
        html = html.replace(/ class=\"(mso[^\"]*)\"/gi, "");
        html = html.replace(/ class=(mso\w+)/gi, "");

        // Eliminate all remaining style attributes
        html = html.replace(/ style=\"[^\"]*\"/gi, "");

        // TOC links. The URL part may be empty (href="#_Toc123") and must stay
        // inside its own quotes, so it can never run on into a later tag.
        html = html.replace(/ href="[^"#]*#_Toc[^\"]*\"/gi, "");
        html = html.replace(/<a>(.*?)<\/a>/gi, "$1");

        this.textarea.value = html;
    }
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment