Skip to content

Instantly share code, notes, and snippets.

@edwintorok
Created December 18, 2023 00:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edwintorok/27b90e6f5f8f3b3e9f89372f05df1b6c to your computer and use it in GitHub Desktop.
Save edwintorok/27b90e6f5f8f3b3e9f89372f05df1b6c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from docx import Document
from docx.oxml import ns
from docx.oxml.table import CT_TblWidth
from docx.oxml.shared import OxmlElement
from docx.oxml.ns import qn
# From python-docx _tag_seq
#
# Each inner tuple lists child tags in the order they are expected to appear
# inside one kind of parent element.  Further down, every tuple is turned into
# a {qualified tag: position} mapping and used as a sort key, so a tag must
# not occur twice within one tuple: the later position would silently win.
#
# The cell-property and row-property orders were previously fused into a
# single tuple.  Both start with "w:cnfStyle", so that tag ended up with the
# row-list position and was sorted *after* "w:tcW" etc. inside cell
# properties.  They are kept as two separate tuples here.
tag_seqs = (
    # Section properties (ends with w:sectPrChange).
    (
        "w:footnotePr",
        "w:endnotePr",
        "w:type",
        "w:pgSz",
        "w:pgMar",
        "w:paperSrc",
        "w:pgBorders",
        "w:lnNumType",
        "w:pgNumType",
        "w:cols",
        "w:formProt",
        "w:vAlign",
        "w:noEndnote",
        "w:titlePg",
        "w:textDirection",
        "w:bidi",
        "w:rtlGutter",
        "w:docGrid",
        "w:printerSettings",
        "w:sectPrChange",
    ),
    # Document settings.
    (
        "w:writeProtection",
        "w:view",
        "w:zoom",
        "w:removePersonalInformation",
        "w:removeDateAndTime",
        "w:doNotDisplayPageBoundaries",
        "w:displayBackgroundShape",
        "w:printPostScriptOverText",
        "w:printFractionalCharacterWidth",
        "w:printFormsData",
        "w:embedTrueTypeFonts",
        "w:embedSystemFonts",
        "w:saveSubsetFonts",
        "w:saveFormsData",
        "w:mirrorMargins",
        "w:alignBordersAndEdges",
        "w:bordersDoNotSurroundHeader",
        "w:bordersDoNotSurroundFooter",
        "w:gutterAtTop",
        "w:hideSpellingErrors",
        "w:hideGrammaticalErrors",
        "w:activeWritingStyle",
        "w:proofState",
        "w:formsDesign",
        "w:attachedTemplate",
        "w:linkStyles",
        "w:stylePaneFormatFilter",
        "w:stylePaneSortMethod",
        "w:documentType",
        "w:mailMerge",
        "w:revisionView",
        "w:trackRevisions",
        "w:doNotTrackMoves",
        "w:doNotTrackFormatting",
        "w:documentProtection",
        "w:autoFormatOverride",
        "w:styleLockTheme",
        "w:styleLockQFSet",
        "w:defaultTabStop",
        "w:autoHyphenation",
        "w:consecutiveHyphenLimit",
        "w:hyphenationZone",
        "w:doNotHyphenateCaps",
        "w:showEnvelope",
        "w:summaryLength",
        "w:clickAndTypeStyle",
        "w:defaultTableStyle",
        "w:evenAndOddHeaders",
        "w:bookFoldRevPrinting",
        "w:bookFoldPrinting",
        "w:bookFoldPrintingSheets",
        "w:drawingGridHorizontalSpacing",
        "w:drawingGridVerticalSpacing",
        "w:displayHorizontalDrawingGridEvery",
        "w:displayVerticalDrawingGridEvery",
        "w:doNotUseMarginsForDrawingGridOrigin",
        "w:drawingGridHorizontalOrigin",
        "w:drawingGridVerticalOrigin",
        "w:doNotShadeFormData",
        "w:noPunctuationKerning",
        "w:characterSpacingControl",
        "w:printTwoOnOne",
        "w:strictFirstAndLastChars",
        "w:noLineBreaksAfter",
        "w:noLineBreaksBefore",
        "w:savePreviewPicture",
        "w:doNotValidateAgainstSchema",
        "w:saveInvalidXml",
        "w:ignoreMixedContent",
        "w:alwaysShowPlaceholderText",
        "w:doNotDemarcateInvalidXml",
        "w:saveXmlDataOnly",
        "w:useXSLTWhenSaving",
        "w:saveThroughXslt",
        "w:showXMLTags",
        "w:alwaysMergeEmptyNamespace",
        "w:updateFields",
        "w:hdrShapeDefaults",
        "w:footnotePr",
        "w:endnotePr",
        "w:compat",
        "w:docVars",
        "w:rsids",
        "m:mathPr",
        "w:attachedSchema",
        "w:themeFontLang",
        "w:clrSchemeMapping",
        "w:doNotIncludeSubdocsInStats",
        "w:doNotAutoCompressPictures",
        "w:forceUpgrade",
        "w:captions",
        "w:readModeInkLockDown",
        "w:smartTagType",
        "sl:schemaLibrary",
        "w:shapeDefaults",
        "w:doNotEmbedSmartTags",
        "w:decimalSymbol",
        "w:listSeparator",
    ),
    # A single style definition (w:name ... w:tblStylePr).
    (
        "w:name",
        "w:aliases",
        "w:basedOn",
        "w:next",
        "w:link",
        "w:autoRedefine",
        "w:hidden",
        "w:uiPriority",
        "w:semiHidden",
        "w:unhideWhenUsed",
        "w:qFormat",
        "w:locked",
        "w:personal",
        "w:personalCompose",
        "w:personalReply",
        "w:rsid",
        "w:pPr",
        "w:rPr",
        "w:tblPr",
        "w:trPr",
        "w:tcPr",
        "w:tblStylePr",
    ),
    # Top level of the styles part.
    ("w:docDefaults", "w:latentStyles", "w:style"),
    # Table properties (ends with w:tblPrChange).
    (
        "w:tblStyle",
        "w:tblpPr",
        "w:tblOverlap",
        "w:bidiVisual",
        "w:tblStyleRowBandSize",
        "w:tblStyleColBandSize",
        "w:tblW",
        "w:jc",
        "w:tblCellSpacing",
        "w:tblInd",
        "w:tblBorders",
        "w:shd",
        "w:tblLayout",
        "w:tblCellMar",
        "w:tblLook",
        "w:tblCaption",
        "w:tblDescription",
        "w:tblPrChange",
    ),
    # Table cell properties (ends with w:tcPrChange).
    (
        "w:cnfStyle",
        "w:tcW",
        "w:gridSpan",
        "w:hMerge",
        "w:vMerge",
        "w:tcBorders",
        "w:shd",
        "w:noWrap",
        "w:tcMar",
        "w:textDirection",
        "w:tcFitText",
        "w:vAlign",
        "w:hideMark",
        "w:headers",
        "w:cellIns",
        "w:cellDel",
        "w:cellMerge",
        "w:tcPrChange",
    ),
    # Table row properties (ends with w:trPrChange).
    (
        "w:cnfStyle",
        "w:divId",
        "w:gridBefore",
        "w:gridAfter",
        "w:wBefore",
        "w:wAfter",
        "w:cantSplit",
        "w:trHeight",
        "w:tblHeader",
        "w:tblCellSpacing",
        "w:jc",
        "w:hidden",
        "w:ins",
        "w:del",
        "w:trPrChange",
    ),
    # Run properties (w:rStyle ... w:oMath).
    (
        "w:rStyle",
        "w:rFonts",
        "w:b",
        "w:bCs",
        "w:i",
        "w:iCs",
        "w:caps",
        "w:smallCaps",
        "w:strike",
        "w:dstrike",
        "w:outline",
        "w:shadow",
        "w:emboss",
        "w:imprint",
        "w:noProof",
        "w:snapToGrid",
        "w:vanish",
        "w:webHidden",
        "w:color",
        "w:spacing",
        "w:w",
        "w:kern",
        "w:position",
        "w:sz",
        "w:szCs",
        "w:highlight",
        "w:u",
        "w:effect",
        "w:bdr",
        "w:shd",
        "w:fitText",
        "w:vertAlign",
        "w:rtl",
        "w:cs",
        "w:em",
        "w:lang",
        "w:eastAsianLayout",
        "w:specVanish",
        "w:oMath",
    ),
    # Paragraph properties (ends with w:pPrChange).
    (
        "w:pStyle",
        "w:keepNext",
        "w:keepLines",
        "w:pageBreakBefore",
        "w:framePr",
        "w:widowControl",
        "w:numPr",
        "w:suppressLineNumbers",
        "w:pBdr",
        "w:shd",
        "w:tabs",
        "w:suppressAutoHyphens",
        "w:kinsoku",
        "w:wordWrap",
        "w:overflowPunct",
        "w:topLinePunct",
        "w:autoSpaceDE",
        "w:autoSpaceDN",
        "w:bidi",
        "w:adjustRightInd",
        "w:snapToGrid",
        "w:spacing",
        "w:ind",
        "w:contextualSpacing",
        "w:mirrorIndents",
        "w:suppressOverlap",
        "w:jc",
        "w:textDirection",
        "w:textAlignment",
        "w:textboxTightWrap",
        "w:outlineLvl",
        "w:divId",
        "w:cnfStyle",
        "w:rPr",
        "w:sectPr",
        "w:pPrChange",
    ),
    # Per-edge children (top/start/left/... as used by border and margin
    # containers).
    (
        "w:top",
        "w:start",
        "w:left",
        "w:bottom",
        "w:end",
        "w:right",
        "w:insideH",
        "w:insideV",
    ),
)
# Expected order of the children of the document settings element.
settings_tag_seq = (
    "w:writeProtection",
    "w:view",
    "w:zoom",
    "w:removePersonalInformation",
    "w:removeDateAndTime",
    "w:doNotDisplayPageBoundaries",
    "w:displayBackgroundShape",
    "w:printPostScriptOverText",
    "w:printFractionalCharacterWidth",
    "w:printFormsData",
    "w:embedTrueTypeFonts",
    "w:embedSystemFonts",
    "w:saveSubsetFonts",
    "w:saveFormsData",
    "w:mirrorMargins",
    "w:alignBordersAndEdges",
    "w:bordersDoNotSurroundHeader",
    "w:bordersDoNotSurroundFooter",
    "w:gutterAtTop",
    "w:hideSpellingErrors",
    "w:hideGrammaticalErrors",
    "w:activeWritingStyle",
    "w:unbounded",
    "w:proofState",
    "w:formsDesign",
    "w:attachedTemplate",
    "w:linkStyles",
    "w:stylePaneFormatFilter",
    "w:stylePaneSortMethod",
    "w:documentType",
    "w:mailMerge",
    "w:revisionView",
    "w:trackRevisions",
    "w:doNotTrackMoves",
    "w:doNotTrackFormatting",
    "w:documentProtection",
    "w:autoFormatOverride",
    "w:styleLockTheme",
    "w:styleLockQFSet",
    "w:defaultTabStop",
    "w:autoHyphenation",
    "w:consecutiveHyphenLimit",
    "w:hyphenationZone",
    "w:doNotHyphenateCaps",
    "w:showEnvelope",
    "w:summaryLength",
    "w:clickAndTypeStyle",
    "w:defaultTableStyle",
    "w:evenAndOddHeaders",
    "w:bookFoldRevPrinting",
    "w:bookFoldPrinting",
    "w:bookFoldPrintingSheets",
    "w:drawingGridHorizontalSpacing",
    "w:drawingGridVerticalSpacing",
    "w:displayHorizontalDrawingGridEvery",
    "w:displayVerticalDrawingGridEvery",
    "w:doNotUseMarginsForDrawingGridOrigin",
    "w:drawingGridHorizontalOrigin",
    "w:drawingGridVerticalOrigin",
    "w:doNotShadeFormData",
    "w:noPunctuationKerning",
    "w:characterSpacingControl",
    "w:printTwoOnOne",
    "w:strictFirstAndLastChars",
    "w:noLineBreaksAfter",
    "w:noLineBreaksBefore",
    "w:savePreviewPicture",
    "w:doNotValidateAgainstSchema",
    "w:saveInvalidXml",
    "w:ignoreMixedContent",
    "w:alwaysShowPlaceholderText",
    "w:doNotDemarcateInvalidXml",
    "w:saveXmlDataOnly",
    "w:useXSLTWhenSaving",
    "w:saveThroughXslt",
    "w:showXMLTags",
    "w:alwaysMergeEmptyNamespace",
    "w:updateFields",
    "w:hdrShapeDefaults",
    "w:footnotePr",
    "w:endnotePr",
    "w:compat",
    "w:docVars",
    "w:rsids",
    "m:mathPr",
    "w:attachedSchema",
    "w:themeFontLang",
    "w:clrSchemeMapping",
    "w:doNotIncludeSubdocsInStats",
    "w:doNotAutoCompressPictures",
    "w:forceUpgrade",
    "w:captions",
    "w:readModeInkLockDown",
    "w:smartTagType",
    "sl:schemaLibrary",
    "w:shapeDefaults",
    "w:doNotEmbedSmartTags",
    "w:decimalSymbol",
    "w:listSeparator",
)


def _rank_by_qualified_tag(prefixed_tags):
    """Return a dict mapping qn(tag) to the tag's position in *prefixed_tags*."""
    ranks = {}
    for position, prefixed in enumerate(prefixed_tags):
        # A repeated tag keeps its last position, exactly like a dict
        # comprehension over the same pairs would.
        ranks[qn(prefixed)] = position
    return ranks


# From here on both names hold lookup tables (qualified tag -> position)
# instead of the plain tuples they were defined as.
settings_tag_seq = _rank_by_qualified_tag(settings_tag_seq)
tag_seqs = [_rank_by_qualified_tag(seq) for seq in tag_seqs]
# Load reference document
import sys

# Two paths are required: argv[1] is opened here and argv[2] is the save
# target at the very end of the script.  Check both up front so a missing
# output path is reported before any work is done, not as an IndexError
# after the whole document has been processed.
if len(sys.argv) < 3:
    sys.exit("usage: %s INPUT.docx OUTPUT.docx" % (sys.argv[0] if sys.argv else "script"))
document = Document(sys.argv[1])
styles = document.styles
# Fix validation error: duplicate style
# Every paragraph whose stripped text is exactly "Abstract" has its style
# cleared and then re-assigned to the "Abstract" style.
for para in document.paragraphs:
    if para.text.strip() != "Abstract":
        continue
    # Clear first, then assign (presumably so only one style reference is
    # left on the paragraph - TODO confirm).
    para.style = None
    para.style = styles["Abstract"]
# Fix validation error: character content in style '>'
# Drop any text that trails the child elements of the run properties of the
# "Abstract Title" style.
atitle = styles["Abstract Title"]
rPr = atitle.element.rPr
# The style may carry no run properties at all (rPr is then presumably None);
# iterating None would raise TypeError, and there is nothing to clean anyway.
if rPr is not None:
    for x in rPr:
        # Text located after a child element is stored in that child's .tail.
        x.tail = None
# Fix validation error: 0.0 instead of 0
# Rewrite the w:w attribute of each table's w:tblW element to the integer
# string "0".
wtag = ns.qn("w:tblW")  # loop-invariant, resolved once
width_attr = ns.qn("w:w")
for t in document.tables:
    tblW = t._tblPr.find(wtag)
    if tblW is not None:
        # Set the attribute on the element that is actually in the tree.
        # Calling CT_TblWidth(tblW) is not a cast: an lxml element-class
        # constructor builds a *new* element and adopts its positional
        # arguments as children, so the value would land on a detached
        # wrapper instead of on this table's w:tblW.
        tblW.set(width_attr, "0")
for s in styles:
    # Not every style object exposes a font (python-docx numbering/list
    # styles do not); skip those instead of failing with AttributeError.
    font = getattr(s, "font", None)
    if font is None:
        continue
    # there might be some asciiTheme attributes that prevent style inheritance from working
    # explicitly wipe font name to make inheritance work
    # (Re-assigning None looks like a no-op but goes through the property
    # setter, which presumably removes the font element - TODO confirm.)
    if font.name is None:
        font.name = None
def sort_by_tag_seq(el) -> None:
    """Reorder the children of *el* in place using the first fitting tag order.

    The module-level ``tag_seqs`` is scanned in order; the first mapping whose
    keys include every child tag of *el* supplies the sort key (tag ->
    position).  The sort is stable, so children sharing a tag keep their
    relative order.  If no mapping covers all child tags, *el* is left
    untouched.

    NOTE(review): the match looks only at the child tags, never at the tag of
    *el* itself, so a small set of children can fit more than one order (for
    example ``w:shd`` and ``w:jc`` occur in two lists with opposite relative
    order) and the earlier list wins - confirm this is acceptable.
    """
    if len(el) < 2:
        # Zero or one child is already ordered under any sequence; this also
        # avoids rewriting the child list of every leaf element.
        return
    child_tags = {child.tag for child in el}
    for tag_seq in tag_seqs:
        # Dict key views compare like sets, so no per-call copy of the keys
        # is needed.
        if child_tags <= tag_seq.keys():
            el[:] = sorted(el, key=lambda child: tag_seq[child.tag])
            return
# Put the children of the settings part into the order given by
# settings_tag_seq.  Every element of the settings tree is visited; an
# element is re-sorted only when all of its child tags are known to that
# table, anything else is left as it is.
settings_set = frozenset(settings_tag_seq)
for el in document.settings._element.iter():
    if not settings_set.issuperset(child.tag for child in el):
        continue
    el[:] = sorted(el, key=lambda child: settings_tag_seq[child.tag])
# TODO: upstream
# Left-pad the w:val of every w:abstractNum/w:nsid in the numbering part with
# zeros until it is 8 characters long; values that are already 8 or more
# characters are not written back.
val_attr = qn("w:val")
for w_nsid in document.part.numbering_part._element.xpath("./w:abstractNum/w:nsid"):
    current = w_nsid.get(val_attr)
    if len(current) < 8:
        w_nsid.set(val_attr, current.rjust(8, "0"))
# quarto w:pPr duplicate, todo fix
# For each paragraph directly inside a cell of a body-level table: fold all
# additional w:pPr elements into the first one, then keep only the first
# w:jc of the merged result.
for wp in document._element.xpath("./w:body/w:tbl/w:tr/w:tc/w:p"):
    pPrs = wp.xpath("./w:pPr")
    if not pPrs:
        # A paragraph without properties has nothing to merge; indexing the
        # empty result would raise IndexError.
        continue
    first = pPrs[0]
    for other in pPrs[1:]:
        # Iterate over a snapshot: appending a child to `first` moves it out
        # of `other` while we loop.
        for child in list(other):
            first.append(child)
        wp.remove(other)
    # remove duplicate w:jc
    for dup in first.xpath("./w:jc")[1:]:
        first.remove(dup)
# TODO: add missing tblGrid in quarto
# Any body-level table that has no w:tblGrid gets one, placed via
# insert_element_before(..., "w:tr") and holding a single w:gridCol whose
# w:w attribute is "7920".
for tbl in document._element.xpath("./w:body/w:tbl"):
    if tbl.xpath("./w:tblGrid"):
        continue
    gridCol = OxmlElement("w:gridCol")
    gridCol.set(qn("w:w"), "7920")
    tblGrid = OxmlElement("w:tblGrid")
    tblGrid.append(gridCol)
    tbl.insert_element_before(tblGrid, "w:tr")
# TODO: quarto or pandoc bug? most likely quarto...
# Run sort_by_tag_seq over every element of the main document tree and then
# of the styles tree, and write the result to the path given as the second
# command-line argument.
for tree_root in (document._element, document.styles._element):
    for el in tree_root.iter():
        sort_by_tag_seq(el)
document.save(sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment