Created
July 21, 2020 18:32
-
-
Save israel-dryer/917e67e23c1ecde4f964dddb65fcafbe to your computer and use it in GitHub Desktop.
Quick and direct search for term inside docx file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A quick and dirty search within the contents of a docx file | |
Author: Israel Dryer | |
Modified: 2020-07-21 | |
""" | |
from zipfile import ZipFile


def read_document_xml(path):
    """Return the raw bytes of ``word/document.xml`` from the docx at *path*.

    A docx is a zip archive. Only the single member that gets searched is
    read, and the archive is closed as soon as the read finishes.

    Raises:
        FileNotFoundError: if *path* does not exist.
        zipfile.BadZipFile: if *path* is not a zip archive.
        KeyError: if the archive has no ``word/document.xml`` member.
    """
    with ZipFile(path) as archive:
        return archive.read('word/document.xml')


def contains_term(document_xml, term):
    """Return True if *term*, encoded as UTF-8, occurs in *document_xml*.

    This is a plain byte search over the raw XML, so it is case-sensitive
    and matches markup as well as document text.
    """
    # the search term must be converted to bytes to match the xml content
    return term.encode('utf-8') in document_xml


# file path
doc_path = 'senior_paper.docx'

# the terms you want to search for
term1 = 'portfolio'
term2 = 'administration'

if __name__ == '__main__':
    # read the document body once and reuse it for every search term
    docx = read_document_xml(doc_path)

    # look for the search terms in the document.xml file contents
    check1 = contains_term(docx, term1)
    check2 = contains_term(docx, term2)

    # print results
    print('\nSEARCH RESULTS\n')
    print(f'Search term {term1} in {doc_path}: {check1}')
    print(f'Search term {term2} in {doc_path}: {check2}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment