Last active
July 25, 2022 14:28
-
-
Save dexeonify/c828538f3c76ee936fffbe8127213917 to your computer and use it in GitHub Desktop.
A glorified script to recursively search a keyword through directories of Microsoft Word .docx files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Search through Word documents and print the related matches.""" | |
from colorama import init, Fore, Style | |
from pathlib import Path | |
import re | |
# python-docx exposes two things called `Document`: the class in
# `docx.document` (which editors can inspect for autocompletion) and the
# callable in `docx` that actually opens a file.
# See here: https://stackoverflow.com/q/42968369/16689935
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only seen by editors and type checkers, never executed. Importing the
    # class here is what enables autocompletion on `Document(...)` results.
    from docx.document import Document
else:
    # This is the `Document` that is used at runtime. Selecting it explicitly
    # avoids calling `Document()` just to provoke a TypeError and re-import.
    from docx import Document
def get_fulltext(path):
    """
    Extract text from a Word document and return it as one long string.

    The text of every paragraph comes first, followed by the text of every
    cell of every table in the document, all joined with single spaces.

    Args:
        path: Location of the .docx file; passed straight to `Document`.

    Returns:
        A single string containing the collected paragraph and cell text.
    """
    doc = Document(path)
    textlist = [paragraph.text for paragraph in doc.paragraphs]

    # Walk all tables, not only the first one, so that text in later tables
    # is searchable too. A document without tables simply skips this loop.
    for table in doc.tables:
        for row in table.rows:
            textlist.extend(cell.text for cell in row.cells)

    return " ".join(textlist)
def find(text, search, *, regex=False):
    """
    Find a keyword in `text` (case-insensitive) together with up to 3 words
    of context before and after each occurrence.

    Args:
        text: The text to search through.
        search: The keyword or phrase to look for. It is matched literally,
            so characters such as `+`, `(`, `.` or `#` are safe to use.
        regex: Set to True to have `search` inserted into the pattern as a
            raw regular expression instead of being escaped. Capturing
            groups inside such a pattern add extra items to each result.

    Returns:
        A list with one tuple per match, as produced by `re.findall`:
        `(whole_match, words_before, keyword, words_after)`. The list is
        empty when nothing matches or when `search` is empty.

    Raises:
        re.error: Only when `regex` is True and `search` is not a valid
            pattern.
    """
    # An empty keyword would "match" between every pair of characters.
    if not search:
        return []

    # re.escape neutralises every regex metacharacter in the user's input.
    keyword = search if regex else re.escape(search)

    # Regex to preview the previous and next 3 words.
    # Regex101: https://regex101.com/r/AbASbx/2
    # Source: https://stackoverflow.com/a/55492108/16689935
    # Built without re.VERBOSE so that spaces and "#" inside the keyword can
    # never be swallowed as layout or as a comment.
    search_regex = re.compile(
        r"("
        r"((?:\w+\W*){0,3})"  # Show 3 words before the match
        rf"({keyword})"  # The actual match
        r"((?:\W*\w+){0,3})"  # Show 3 words after the match
        r")",
        re.IGNORECASE,
    )
    return search_regex.findall(text)
def display_results(matches, filename):
    """
    Print the matches found in one file as a colored, numbered list.

    The file name is shown in yellow on a heading line. Each match is then
    printed on its own line, numbered from 1, with the keyword in magenta
    between its leading and trailing context.
    """
    heading = f"Match found in {Fore.YELLOW}{filename}{Style.RESET_ALL}:"
    print(heading)

    number = 0
    for match in matches:
        number += 1
        # Index 0 is the whole match; 1-3 are the context before, the
        # keyword itself and the context after.
        before = match[1]
        highlighted = Fore.MAGENTA + match[2] + Style.RESET_ALL
        after = match[3]
        print(f"{number}. ...{before}{highlighted}{after}...")
# Initialize colorama first, before any colored text is written, so that the
# ANSI escape sequences in the prompt and the results header are handled too.
init()

# The prompt ends with Fore.CYAN so that whatever the user types is shown in
# cyan; the header printed next switches the color again and then resets it.
search = input("Enter some text you would like to search: " + Fore.CYAN)
print(f"\n{Fore.GREEN}===== Results ====={Style.RESET_ALL}")

# Recursively collect .docx files below the current working directory,
# skipping any whose name starts with "~".
files = Path.cwd().rglob("[!~]*.docx")

for file in files:
    # Show only the immediate parent folder and the file name.
    filename = Path(file.parent.name) / Path(file.name)
    fulltext = get_fulltext(file)
    matches = find(fulltext, search)
    if matches:
        display_results(matches, filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment