Skip to content

Instantly share code, notes, and snippets.

@dlukes
Last active May 7, 2019 15:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dlukes/2b5c2a163cd8adba420aaae0c8ea2c00 to your computer and use it in GitHub Desktop.
Save dlukes/2b5c2a163cd8adba420aaae0c8ea2c00 to your computer and use it in GitHub Desktop.
Remove dates from comments and tracked edits in docx. Also, a cheatsheet for namespaces in lxml.
#!/usr/bin/env python3
"""Usage: {} AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX
Remove date metadata from Word document for authors matching
AUTHOR_SUBSTRING. Handy if you don't want other people to know when
exactly you found time to work on their document ;)
In more detail: Read INPUT.DOCX, extract the comments and tracked edits,
manipulate them (cf. functions `modify_comments()` and
`modify_tracked_edits()` -- by default, they remove date metadata
when the author contains AUTHOR_SUBSTRING, but you can tweak them
depending on your needs), then create OUTPUT.DOCX with the modified
comments and tracked edits.
Note that OUTPUT.DOCX tends to be larger than the input file for some
reason. To get back to a reasonable size, just re-save OUTPUT.DOCX from
Word.
"""
import sys
import zipfile
from itertools import chain
from lxml import etree
# Namespace URI of the main WordprocessingML vocabulary (the `w:` prefix
# used throughout the XML parts of a docx file).
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Prefix -> URI mapping passed to xpath() so that `w:...` steps resolve.
NSMAP = dict(w=W_NS)
# Clark-notation prefix: W + "author" is the fully qualified name of the
# w:author attribute as lxml exposes it in `.attrib`.
W = f"{{{W_NS}}}"
def modify_comments(comments, author_substr):
    """Strip the date from comments whose author contains *author_substr*.

    Args:
        comments: Serialized XML of the comments part of a docx archive,
            as accepted by ``etree.fromstring``.
        author_substr: Substring looked for in each comment's ``w:author``
            attribute.

    Returns:
        The modified XML, serialized with ``etree.tostring``.
    """
    et = etree.fromstring(comments)
    for c in et.xpath("//w:comment", namespaces=NSMAP):
        # Use .get()/.pop() with defaults: a comment that lacks w:author or
        # w:date must be skipped quietly instead of raising KeyError.
        if author_substr in c.get(W + "author", ""):
            c.attrib.pop(W + "date", None)
        # NOTE: basically do whatever you want here with the comment
        # based on its XML attributes and content, e.g.:
        # author = c.attrib[W + "author"]
        # date = c.attrib[W + "date"]
        # print(f"{author} commented on {date}:")
        # print(c.xpath("string(.)"))
        # NOTE: if you want to completely delete the comment, you can't
        # just do `et.remove(c)`, because there's also a comment marker
        # within the document itself, so that's more of a hassle
    return etree.tostring(et)
def modify_tracked_edits(document, author_substr):
    """Strip the date from tracked edits whose author contains *author_substr*.

    Only tracked insertions (``w:ins``) and deletions (``w:del``) are
    considered.

    Args:
        document: Serialized XML of the main document part of a docx
            archive, as accepted by ``etree.fromstring``.
        author_substr: Substring looked for in each edit's ``w:author``
            attribute.

    Returns:
        The modified XML, serialized with ``etree.tostring``.
    """
    et = etree.fromstring(document)
    # A single union query collects both kinds of tracked edit.
    for te in et.xpath("//w:ins | //w:del", namespaces=NSMAP):
        # NOTE: the guidelines are the same as in modify_comments: tweak
        # this body to do whatever you need with each tracked edit.
        # Use .get()/.pop() with defaults: an edit that lacks w:author or
        # w:date must be skipped quietly instead of raising KeyError.
        if author_substr in te.get(W + "author", ""):
            te.attrib.pop(W + "date", None)
    return etree.tostring(et)
def main():
    """Command-line entry point: AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX.

    Copies every entry of INPUT.DOCX into a freshly created OUTPUT.DOCX,
    passing the comments part through ``modify_comments()`` and the main
    document part through ``modify_tracked_edits()`` on the way.

    Exits with the usage message (status 1) when the number of arguments
    is wrong.
    """
    args = sys.argv[1:]
    if len(args) != 3:
        # The module docstring carries a `{}` placeholder for the script name.
        usage = __doc__ or "Usage: {} AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX"
        sys.exit(usage.format(sys.argv[0]))
    author_substr, in_fname, out_fname = args

    comments_path = "word/comments.xml"
    document_path = "word/document.xml"
    # Archive entry name -> function that rewrites that entry's content.
    modifiers = {
        comments_path: modify_comments,
        document_path: modify_tracked_edits,
    }

    # Recreate the zip archive in a single pass over the input entries.
    with zipfile.ZipFile(in_fname) as docx_in, zipfile.ZipFile(
        out_fname, "w"
    ) as docx_out:
        # NOTE: this is some kind of ZIP file thing, NOT Word comments...
        docx_out.comment = docx_in.comment
        for item in docx_in.infolist():
            data = docx_in.read(item)
            modify = modifiers.get(item.filename)
            # A docx without any comments has no comments part; since only
            # entries that actually exist are visited, that case just works.
            if modify is not None:
                data = modify(data, author_substr)
            # Reusing the original ZipInfo keeps each entry's compression
            # method. Writing under a bare name would fall back to the
            # archive default (ZIP_STORED) and store the XML uncompressed,
            # which needlessly inflates the output file.
            docx_out.writestr(item, data)
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
@leyiwang
Copy link

leyiwang commented Mar 7, 2019

Hi, do you know of a way to add a new comment to a docx file?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment