Anonymous (owner)

Forks

Revisions

  • 4c134f Fri Jul 03 00:24:39 -0700 2009
  • e976a4 Fri Jul 03 00:23:28 -0700 2009
gist: 139987 Download_button fork
public
Description:
A ruby snippet for Parsing and cleaning Word HTML
Public Clone URL: git://gist.github.com/139987.git
Embed All Files: show embed
Ruby #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#
# This function takes messy Word HTML pasted into a WYSIWYG editor and cleans it up.
# It keeps only the tags and attributes specified in the params.
# Copyright (c) 2009, Radio New Zealand
# Released under the MIT license
 
require 'rubygems'
require 'sanitize'
 
# Cleans up Word-generated HTML and returns the result as a new string.
#
# html       - String of HTML to clean. The caller's string is not modified.
# elements   - Array of tag names handed to Sanitize as the :elements option.
# attributes - Hash handed to Sanitize as the :attributes option.
#
# Returns the cleaned HTML with a newline after every closing
# p / h<digit> / dt / dd / dl tag and after every opening dl tag.
def clean_up_word_html(html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes = {})
  # Character classes rather than single-character alternations: the same
  # addresses match, but "_" no longer has two ways to match (it is already in
  # \w), which avoids pathological backtracking on non-matching lines.
  email_regex = /<p>Email:\s+[\w.-]+@(?:[\w-]+\.)+[a-zA-Z]{2,}/i

  # Drop line breaks. String#delete returns a copy, so the argument is left
  # untouched and frozen strings are accepted. Only CR and LF are removed;
  # literal "|" characters in the text are kept.
  html = html.delete("\r\n")

  # keep only the things we want.
  html = Sanitize.clean(html, :elements => elements, :attributes => attributes)

  # butt up any tags
  html = html.gsub(/&nbsp;/, ' ').gsub(/>\s+</, '><')

  # remove email address lines
  html = html.gsub(email_regex, '<p>')

  # post sanitize cleanup of empty blocks
  # the order of removal is important - this is the way Word stacks these elements
  [%r{<i></i>}, %r{<b></b>}, %r{</b><b>}, %r{<p></p>}, %r{<p><b></b></p>}].each do |empty_block|
    html = html.gsub(empty_block, '')
  end

  # misc - fix butted times ("9am " -> "9 am ", "5pm " -> "5 pm ")
  html = html.gsub(/(\d)am /, '\1 am ').gsub(/(\d)pm /, '\1 pm ')

  # misc - remove multiple spaces that may cause doc specific regexes to fail (in dates for example)
  html = html.gsub(/\s+/, ' ')

  # add new lines at the end of lines
  html = html.gsub(%r{</(p|h\d|dt|dd|dl)>}, "</\\1>\n")
  html.gsub(/<dl>/, "<dl>\n")
end