Skip to content

Instantly share code, notes, and snippets.

@dexeonify
Last active July 25, 2022 14:28
Show Gist options
  • Save dexeonify/c828538f3c76ee936fffbe8127213917 to your computer and use it in GitHub Desktop.
Save dexeonify/c828538f3c76ee936fffbe8127213917 to your computer and use it in GitHub Desktop.
A glorified script to recursively search a keyword through directories of Microsoft Word .docx files.
"""Search through Word documents and print the related matches."""
from colorama import init, Fore, Style
from pathlib import Path
import re
# python-docx exposes two things called `Document`:
#   * `docx.document.Document` -- the class, which editors can introspect
#     for autocompletion;
#   * `docx.Document` -- the callable that actually opens a .docx file.
# See here: https://stackoverflow.com/q/42968369/16689935
#
# Static tools evaluate `TYPE_CHECKING` as true and therefore see the class,
# while at runtime only the `else` branch executes. This selects the runtime
# callable deterministically instead of relying on `Document()` raising
# `TypeError`, and leaves no stray module-level variable behind.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Never executed at runtime; used for auto-complete only.
    from docx.document import Document
else:
    # This is the actual `Document` callable that the code uses.
    from docx import Document
def get_fulltext(path):
    """
    Extract text from a Word document and return it as one long string.

    The text of every paragraph in ``doc.paragraphs`` comes first, followed
    by the text of every cell of every table in ``doc.tables``. All pieces
    are joined with a single space.

    ``path`` is passed straight to ``Document``.
    """
    doc = Document(path)
    textlist = [paragraph.text for paragraph in doc.paragraphs]

    # Walk every table, not just the first one, so that keywords located in
    # later tables are searchable too. A document without tables simply
    # yields an empty sequence here, so no IndexError handling is needed.
    for table in doc.tables:
        for row in table.rows:
            textlist.extend(cell.text for cell in row.cells)

    return " ".join(textlist)
def find(text, search):
    """
    Find a literal keyword in ``text``, ignoring case.

    Each hit also captures up to 3 words before and after the keyword for
    context. Return a list of 4-tuples, one per match:
    ``(whole_match, words_before, keyword_as_found, words_after)``.
    An empty ``search`` returns an empty list.
    """
    # An empty keyword would "match" at every position; treat it as no hits.
    if not search:
        return []

    # Escape the keyword so that it is matched literally. `re.escape` covers
    # regex metacharacters (e.g. "$", "+", "(") as well as spaces and "#",
    # which would otherwise be ignored or start a comment in verbose mode.
    escaped = re.escape(search)

    # Regex to preview the previous and next 3 words.
    # Regex101: https://regex101.com/r/AbASbx/2
    # Source: https://stackoverflow.com/a/55492108/16689935
    search_regex = re.compile(
        rf"""(
            ((?:\w+\W*){{0,3}})   # Show 3 words before the match
            ({escaped})           # The actual match
            ((?:\W*\w+){{0,3}})   # Show 3 words after the match
        )""",
        re.VERBOSE | re.IGNORECASE,
    )
    return search_regex.findall(text)
def display_results(matches, filename):
    """
    Print a colored header naming ``filename``, then one numbered line per
    match with the keyword highlighted between its surrounding context.

    Each item of ``matches`` is indexed at positions 1, 2 and 3 for the
    text before the keyword, the keyword itself and the text after it.
    """
    print(f"Match found in {Fore.YELLOW}{filename}{Style.RESET_ALL}:")
    for number, hit in enumerate(matches, start=1):
        before, keyword, after = hit[1], hit[2], hit[3]
        # Only the keyword is colored; the context stays in the default color.
        highlighted = Fore.MAGENTA + keyword + Style.RESET_ALL
        print(f"{number}. ...{before}{highlighted}{after}...")
# Initialize colorama to filter ANSI escape sequences. This has to happen
# before the first colored output (the prompt below), otherwise those
# sequences are written before colorama has wrapped the output streams.
init()

search = input("Enter some text you would like to search: " + Fore.CYAN)
print(f"\n{Fore.GREEN}===== Results ====={Style.RESET_ALL}")

# Recursively collect .docx files under the current working directory.
# The "[!~]" prefix skips names starting with "~" -- presumably Word's
# temporary "~$..." files; confirm if other "~" files should be searched.
files = Path.cwd().rglob("[!~]*.docx")

for file in files:
    fulltext = get_fulltext(file)
    matches = find(fulltext, search)
    if matches:
        # Show the file as "<parent folder>/<file name>" to keep output short.
        filename = Path(file.parent.name) / Path(file.name)
        display_results(matches, filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment