Skip to content

Instantly share code, notes, and snippets.

@sheepeeh
Last active June 22, 2020 13:51
Show Gist options
  • Save sheepeeh/7e2c71478982b49ce348 to your computer and use it in GitHub Desktop.
Save sheepeeh/7e2c71478982b49ce348 to your computer and use it in GitHub Desktop.
Get clean HTML from a template-generated Word document (also requires pandoc)
' The final step in the process requires pandoc http://pandoc.org/
' This was designed for our particular Finding Aid template, so some sections might not apply. The ConvertHeadings function, in particular, references a custom style and decrements all Heading sizes by 1.
' Running the prepWordForHtml macro will do the following:
' - Remove all Content Controls (but preserve the content)
' - Convert list numbering to plain text
' - Change Box Heading Style to Heading 3
' - Change Heading 2 to Heading 3
' - Change Heading 1 to Heading 2
' - Delete Captions
' - Delete TOC Heading text
' - Remove all visible and hidden Bookmarks
' - Delete the Table of Contents
'
' Once the macro is finished, save your newly clean document as a DOCX file.
' Open a command prompt in the folder containing your file, and type the following command:
' pandoc -s YourInputFile.docx -o YourOutputFile.docx
'
' The -s flag is *very important,* especially on Windows machines. Without it, you will have all kinds of encoding problems. (arrrgh, smart quotes!)
' The resulting file should be clean of all the garbage Word typically includes when exporting to HTML. The only artifact is an id attribute for every heading--but I consider it a bonus!
Function RemoveContentControls()
Dim cc As ContentControl
If ActiveDocument.ContentControls.Count <> 0 Then
For Each cc In ActiveDocument.ContentControls
cc.Delete
Next
End If
End Function
Function ConvertNumberedListToManual()
ActiveDocument.Range.ListFormat.ConvertNumbersToText
End Function
Function ConvertHeadings()
ActiveDocument.Range.Select
Selection.Find.ClearFormatting
Selection.Find.Style = ActiveDocument.Styles("Box Heading")
Selection.Find.Replacement.ClearFormatting
Selection.Find.Replacement.Style = ActiveDocument.Styles("Heading 3")
With Selection.Find
.Text = ""
.Replacement.Text = ""
.Forward = True
.Wrap = wdFindContinue
.Format = True
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
Selection.Find.ClearFormatting
Selection.Find.Style = ActiveDocument.Styles("Heading 2")
Selection.Find.Replacement.ClearFormatting
Selection.Find.Replacement.Style = ActiveDocument.Styles("Heading 3")
With Selection.Find
.Text = ""
.Replacement.Text = ""
.Forward = True
.Wrap = wdFindContinue
.Format = True
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
Selection.Find.ClearFormatting
Selection.Find.Style = ActiveDocument.Styles("Heading 1")
Selection.Find.Replacement.ClearFormatting
Selection.Find.Replacement.Style = ActiveDocument.Styles("Heading 2")
With Selection.Find
.Text = ""
.Replacement.Text = ""
.Forward = True
.Wrap = wdFindContinue
.Format = True
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = False
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
Selection.Find.ClearFormatting
Selection.Find.Style = ActiveDocument.Styles("Caption")
Selection.Find.Replacement.ClearFormatting
With Selection.Find
.Text = "*"
.Replacement.Text = ""
.Forward = True
.Wrap = wdFindContinue
.Format = True
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = True
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
Selection.Find.ClearFormatting
Selection.Find.Style = ActiveDocument.Styles("TOC Heading")
Selection.Find.Replacement.ClearFormatting
With Selection.Find
.Text = "*"
.Replacement.Text = ""
.Forward = True
.Wrap = wdFindContinue
.Format = True
.MatchCase = False
.MatchWholeWord = False
.MatchWildcards = True
.MatchSoundsLike = False
.MatchAllWordForms = False
End With
Selection.Find.Execute Replace:=wdReplaceAll
Selection.WholeStory
Selection.ParagraphFormat.Alignment = wdAlignParagraphLeft
End Function
Function RemoveHeaderFooter()
Dim oHF As HeaderFooter
Dim oSection As Section
For Each oSection In ActiveDocument.Sections
For Each oFF In oSection.Headers
oFF.Range.Delete
Next
For Each oFF In oSection.Footers
oFF.Range.Delete
Next
Next
End Function
Function RemoveBookmarks()
Dim bkm As Bookmark
ActiveDocument.Bookmarks.ShowHidden = True
For Each bkm In ActiveDocument.Bookmarks
bkm.Delete
Next bkm
If ActiveDocument.TablesOfContents.Count <> 0 Then
ActiveDocument.TablesOfContents(1).Delete
End If
End Function
Sub PrepWordForHtml()
Dim Ret_type As Integer
Dim strMsg As String
Dim strTitle As String
strMsg = "This macro removes and changes formatting and content. These actions cannot be undone. Click OK to continue or Cancel to stop."
strTitle = "WARNING: Action cannot be undone."
Ret_type = MsgBox(strMsg, vbOKCancel + vbExclamation, strTitle)
Select Case Ret_type
Case 1
Call RemoveContentControls
Call ConvertNumberedListToManual
Call ConvertHeadings
Call RemoveHeaderFooter
Call RemoveBookmarks
Case 2
' Cancel
End Select
End Sub
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment