Skip to content

Instantly share code, notes, and snippets.

@bofrede
Last active February 8, 2017 16:22
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save bofrede/4595889 to your computer and use it in GitHub Desktop.
Save bofrede/4595889 to your computer and use it in GitHub Desktop.
Export Word documents as HTML
=begin
This script requires a Ruby interpreter to run:
http://rubyinstaller.org/
This script also requires Microsoft Windows and Microsoft Word to be installed.
A few libraries, used by this script:
HTML Sanitizer:
https://github.com/rgrove/sanitize/
HTML parser and modifier:
http://nokogiri.org/
Tk user interface:
http://www.tkdocs.com/tutorial/windows.html#dialogs
For more information on the Word Document class see:
http://msdn.microsoft.com/en-us/library/bb244898(v=office.12).aspx
Document.saveas http://msdn.microsoft.com/en-us/library/bb221597.aspx
Document.saveas2 http://msdn.microsoft.com/en-us/library/ff836084(v=office.14).aspx
msoEncoding values http://msdn.microsoft.com/en-us/library/office/aa432511(v=office.12).aspx
=end
begin
require 'win32ole'
require 'tk'
require 'sanitize'
rescue LoadError => le
puts "LoadError: #{le.message}"
puts "Run: gem install win32ole tk sanitize"
exit
end
# Symbolic names for the numeric file-format codes passed as 'FileFormat' to
# Word's Document.SaveAs. Several names intentionally share a value (e.g.
# :document / :document_97), mirroring the aliases in Word's own enumeration.
# Frozen so the lookup table cannot be modified by accident at runtime.
# From: http://msdn.microsoft.com/en-us/library/bb238158(v=office.12).aspx
WD_FORMAT = {
  :document                   => 0,  # Microsoft Office Word format.
  :document_97                => 0,  # Microsoft Word 97 document format.
  :template                   => 1,  # Word template format.
  :template_97                => 1,  # Word 97 template format.
  :text                       => 2,  # Microsoft Windows text format.
  :text_line_breaks           => 3,  # Windows text format with line breaks preserved.
  :dos_text                   => 4,  # Microsoft DOS text format.
  :dos_text_line_breaks       => 5,  # Microsoft DOS text with line breaks preserved.
  :rtf                        => 6,  # Rich text format (RTF).
  :encoded_text               => 7,  # Encoded text format.
  :unicode_text               => 7,  # Unicode text format.
  :html                       => 8,  # Standard HTML format.
  :web_archive                => 9,  # Web archive format.
  :filtered_html              => 10, # Filtered HTML format.
  :xml                        => 11, # Extensible Markup Language (XML) format.
  :xml_document               => 12, # XML document format.
  :xml_document_macro_enabled => 13, # XML document format with macros enabled.
  :xml_template               => 14, # XML template format.
  :xml_template_macro_enabled => 15, # XML template format with macros enabled.
  :document_default           => 16, # Word default document file format. For Microsoft Office Word 2007, this is the DOCX format.
  :pdf                        => 17, # PDF format.
  :xps                        => 18  # XPS format.
}.freeze
# Sanitize configuration applied to the HTML that Word exports.
#   :allow_comments  - HTML comments are kept.
#   :remove_contents - <script> and <style> are dropped together with their contents.
#   :elements        - the only element names that are kept.
#   :attributes      - attributes kept per element (:all applies to every element).
#   :protocols       - URL schemes accepted in URL-valued attributes; :relative
#                      permits relative URLs.
# The top-level hash is frozen so the configuration cannot be modified by
# accident; nested values are left as they were.
WHITE_LIST = {
  :allow_comments => true,
  :remove_contents => ['script', 'style'],
  :elements => %w{
    html head title link meta body
    h1 h2 h3 h4 h5 h6 p
    dd dl dt li ol ul
    caption col colgroup table tbody td tfoot th thead tr
    a abbr b blockquote br cite code del dfn div em figcaption figure hgroup i img ins kbd mark
    pre q rp rt ruby s samp small strike strong sub sup time var wbr
  },
  :attributes => {
    :all         => ['title', 'id', 'class'],
    'html'       => ['lang'],
    'meta'       => ['http-equiv', 'name', 'content'],
    'a'          => ['href', 'name'],
    'blockquote' => ['cite'],
    'col'        => ['span', 'width'],
    'colgroup'   => ['span', 'width'],
    'del'        => ['cite', 'datetime'],
    'img'        => ['align', 'alt', 'height', 'src', 'width'],
    'ins'        => ['cite', 'datetime'],
    'ol'         => ['start', 'reversed', 'type'],
    'q'          => ['cite'],
    'table'      => ['border', 'summary', 'width'],
    'td'         => ['abbr', 'axis', 'colspan', 'rowspan', 'width'],
    'th'         => ['abbr', 'axis', 'colspan', 'rowspan', 'scope', 'width'],
    'time'       => ['datetime', 'pubdate'],
    'ul'         => ['type']
  },
  :protocols => {
    'a'          => {'href' => ['ftp', 'http', 'https', 'mailto', :relative]},
    'blockquote' => {'cite' => ['http', 'https', :relative]},
    'del'        => {'cite' => ['http', 'https', :relative]},
    'img'        => {'src' => ['http', 'https', :relative, 'data']},
    'ins'        => {'cite' => ['http', 'https', :relative]},
    'q'          => {'cite' => ['http', 'https', :relative]}
  }
}.freeze
# Word's own "normal" style classes; elements carrying one of them get their
# class attribute removed from the sanitized output.
UNWANTED_CLASSES = %w{MsoNormal MsoBodyText NormalBold MsoHeader Templatehelp
                      TOCEntry Indent1 MsoCaption MsoListParagraph
                      MsoNormalTable MsoTableGrid MsoTableClassic1}.freeze

# Main script:
#   1. Start a hidden Word instance and ask the user for a document (Tk dialog).
#   2. Let Word save the document as filtered HTML next to it (<name>_raw.html).
#   3. Re-read that HTML, sanitize it with WHITE_LIST and tidy up Word leftovers.
#   4. Write the result as UTF-8 to <name>.html.
# OLE runtime errors are reported on stdout; Word is always shut down at the end.
begin
  word = WIN32OLE.new('Word.Application')
  word.visible = false
  word_file_name = Tk::getOpenFile(:filetypes => [['Word documents','*.doc?'], ['All files', '*.*']])
  # Tk hands back an empty string (which is truthy in Ruby) when the dialog is
  # cancelled, so test for emptiness rather than for nil only.
  if word_file_name.to_s.empty?
    puts 'No file selected.'
  else
    # Forward slashes in file names with spaces cause: "OLE error code:800A1436 in Microsoft Word"
    word_file_name = word_file_name.tr('/', '\\')
    puts "Converting: #{word_file_name}"
    word_document = word.documents.Open(word_file_name)
    if word_document.nil?
      puts ' File not found! Probably due to spaces in the file path.'
    else
      # Strip only a trailing extension of the file itself. A dot inside a
      # directory name is left alone, and a file without an extension keeps its
      # full name, so the output can never end up with the input's own path.
      base_name = word_file_name.sub(/\.[^.\\]*\z/, '')
      html_file_name = "#{base_name}_raw.html"
      puts "Saving as #{html_file_name}"
      # 65001 requests UTF-8 -- Encoding is ignored!
      word_document.saveas({'FileName' => html_file_name, 'FileFormat' => WD_FORMAT[:filtered_html], 'Encoding' => 65001})
      word_document.close

      # Reopen html file, using the same charset Word used to save it
      # (windows-1252), transcoding to UTF-8 while reading. The block form
      # guarantees the file handle is closed again.
      puts "Reading HTML from #{html_file_name}"
      html = File.open(html_file_name, 'r:windows-1252:utf-8') do |html_file|
        puts "HTML file encoding #{html_file.external_encoding.name}"
        '<!DOCTYPE html>' + html_file.read
      end

      puts 'Sanitizing'
      html_document = Nokogiri::HTML::Document.parse(html)
      Sanitize.new(WHITE_LIST).clean_node!(html_document)

      # Both elements are normally present, but do not crash if they are not.
      html_element = html_document.css('html').first
      html_element['lang'] = 'en-US' if html_element
      html_document.css('meta[name="Generator"]').each(&:remove)

      # Remove page numbers from TOC. The text is assigned via content= so that
      # characters such as & or < in a heading are escaped instead of being
      # re-parsed as markup.
      html_document.css('.MsoToc1 a, .MsoToc2 a').each do |item|
        item.content = item.inner_text.sub(/(\s+\d+)\Z/, '')
      end

      # Remove Word's "normal" classes.
      UNWANTED_CLASSES.each do |class_name|
        html_document.css(".#{class_name}").each do |node|
          node.remove_attribute('class')
        end
      end

      # Remove abandoned anchors, that are not linked to. The anchor element is
      # replaced by its own inner markup, so its content stays in the document.
      html_document.css('a[name]').each do |a|
        if html_document.css('a[href="#' + a['name'] + '"]').empty?
          puts "<a name=\"#{a['name']}\"> was removed."
          a.replace(a.inner_html)
        end
      end

      sanitized_html = html_document.to_html({:encoding => 'UTF-8', :indent => 0})

      # write output to (new) file
      sanitized_html_file_name = "#{base_name}.html"
      puts "Writing sanitized HTML file: #{sanitized_html_file_name}"
      File.open(sanitized_html_file_name, 'w:UTF-8') do |f|
        f.write sanitized_html
      end
      puts 'Done.'
    end
  end
rescue WIN32OLERuntimeError => rte
  puts "Error: #{rte.message}"
ensure
  # word is nil when WIN32OLE.new itself failed.
  word.quit if word
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment