Last active
June 22, 2020 13:51
-
-
Save sheepeeh/7e2c71478982b49ce348 to your computer and use it in GitHub Desktop.
Get clean HTML from a template-generated Word document (also requires pandoc)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
' The final step in the process requires pandoc http://pandoc.org/ | |
' This was designed for our particular Finding Aid template, so some sections might not apply. The ConvertHeadings function, in particular, references a custom style and decrements all Heading sizes by 1. | |
' Running the prepWordForHtml macro will do the following: | |
' - Remove all Content Controls (but preserve the content) | |
' - Convert list numbering to plain text | |
' - Change Box Heading Style to Heading 3 | |
' - Change Heading 2 to Heading 3 | |
' - Change Heading 1 to Heading 2 | |
' - Delete Captions | |
' - Delete TOC Heading text | |
' - Remove all visible and hidden Bookmarks | |
' - Delete the Table of Contents | |
' | |
' Once the macro is finished, save your newly clean document as a DOCX file. | |
' Open a command prompt in the folder containing your file, and type the following command: | |
' pandoc -s YourInputFile.docx -o YourOutputFile.docx | |
' | |
' The -s flag is *very important,* especially on Windows machines. Without it, you will have all kinds of encoding problems. (arrrgh, smart quotes!) | |
' The resulting file should be clean of all the garbage Word typically includes when exporting to HTML. The only artifact is an id attribute for every heading--but I consider it a bonus! | |
Function RemoveContentControls() | |
Dim cc As ContentControl | |
If ActiveDocument.ContentControls.Count <> 0 Then | |
For Each cc In ActiveDocument.ContentControls | |
cc.Delete | |
Next | |
End If | |
End Function | |
Function ConvertNumberedListToManual() | |
ActiveDocument.Range.ListFormat.ConvertNumbersToText | |
End Function | |
Function ConvertHeadings() | |
ActiveDocument.Range.Select | |
Selection.Find.ClearFormatting | |
Selection.Find.Style = ActiveDocument.Styles("Box Heading") | |
Selection.Find.Replacement.ClearFormatting | |
Selection.Find.Replacement.Style = ActiveDocument.Styles("Heading 3") | |
With Selection.Find | |
.Text = "" | |
.Replacement.Text = "" | |
.Forward = True | |
.Wrap = wdFindContinue | |
.Format = True | |
.MatchCase = False | |
.MatchWholeWord = False | |
.MatchWildcards = False | |
.MatchSoundsLike = False | |
.MatchAllWordForms = False | |
End With | |
Selection.Find.Execute Replace:=wdReplaceAll | |
Selection.Find.ClearFormatting | |
Selection.Find.Style = ActiveDocument.Styles("Heading 2") | |
Selection.Find.Replacement.ClearFormatting | |
Selection.Find.Replacement.Style = ActiveDocument.Styles("Heading 3") | |
With Selection.Find | |
.Text = "" | |
.Replacement.Text = "" | |
.Forward = True | |
.Wrap = wdFindContinue | |
.Format = True | |
.MatchCase = False | |
.MatchWholeWord = False | |
.MatchWildcards = False | |
.MatchSoundsLike = False | |
.MatchAllWordForms = False | |
End With | |
Selection.Find.Execute Replace:=wdReplaceAll | |
Selection.Find.ClearFormatting | |
Selection.Find.Style = ActiveDocument.Styles("Heading 1") | |
Selection.Find.Replacement.ClearFormatting | |
Selection.Find.Replacement.Style = ActiveDocument.Styles("Heading 2") | |
With Selection.Find | |
.Text = "" | |
.Replacement.Text = "" | |
.Forward = True | |
.Wrap = wdFindContinue | |
.Format = True | |
.MatchCase = False | |
.MatchWholeWord = False | |
.MatchWildcards = False | |
.MatchSoundsLike = False | |
.MatchAllWordForms = False | |
End With | |
Selection.Find.Execute Replace:=wdReplaceAll | |
Selection.Find.ClearFormatting | |
Selection.Find.Style = ActiveDocument.Styles("Caption") | |
Selection.Find.Replacement.ClearFormatting | |
With Selection.Find | |
.Text = "*" | |
.Replacement.Text = "" | |
.Forward = True | |
.Wrap = wdFindContinue | |
.Format = True | |
.MatchCase = False | |
.MatchWholeWord = False | |
.MatchWildcards = True | |
.MatchSoundsLike = False | |
.MatchAllWordForms = False | |
End With | |
Selection.Find.Execute Replace:=wdReplaceAll | |
Selection.Find.ClearFormatting | |
Selection.Find.Style = ActiveDocument.Styles("TOC Heading") | |
Selection.Find.Replacement.ClearFormatting | |
With Selection.Find | |
.Text = "*" | |
.Replacement.Text = "" | |
.Forward = True | |
.Wrap = wdFindContinue | |
.Format = True | |
.MatchCase = False | |
.MatchWholeWord = False | |
.MatchWildcards = True | |
.MatchSoundsLike = False | |
.MatchAllWordForms = False | |
End With | |
Selection.Find.Execute Replace:=wdReplaceAll | |
Selection.WholeStory | |
Selection.ParagraphFormat.Alignment = wdAlignParagraphLeft | |
End Function | |
Function RemoveHeaderFooter() | |
Dim oHF As HeaderFooter | |
Dim oSection As Section | |
For Each oSection In ActiveDocument.Sections | |
For Each oFF In oSection.Headers | |
oFF.Range.Delete | |
Next | |
For Each oFF In oSection.Footers | |
oFF.Range.Delete | |
Next | |
Next | |
End Function | |
Function RemoveBookmarks() | |
Dim bkm As Bookmark | |
ActiveDocument.Bookmarks.ShowHidden = True | |
For Each bkm In ActiveDocument.Bookmarks | |
bkm.Delete | |
Next bkm | |
If ActiveDocument.TablesOfContents.Count <> 0 Then | |
ActiveDocument.TablesOfContents(1).Delete | |
End If | |
End Function | |
Sub PrepWordForHtml() | |
Dim Ret_type As Integer | |
Dim strMsg As String | |
Dim strTitle As String | |
strMsg = "This macro removes and changes formatting and content. These actions cannot be undone. Click OK to continue or Cancel to stop." | |
strTitle = "WARNING: Action cannot be undone." | |
Ret_type = MsgBox(strMsg, vbOKCancel + vbExclamation, strTitle) | |
Select Case Ret_type | |
Case 1 | |
Call RemoveContentControls | |
Call ConvertNumberedListToManual | |
Call ConvertHeadings | |
Call RemoveHeaderFooter | |
Call RemoveBookmarks | |
Case 2 | |
' Cancel | |
End Select | |
End Sub |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment