dexeonify/search_docx.py

## search_docx.py
"""Search through Word documents and print the related matches."""

from colorama import init, Fore, Style
from pathlib import Path
import re

# This is a hacky workaround to enable autocompletion for python-docx
# See here: https://stackoverflow.com/q/42968369/16689935
from docx.document import Document

# NOTE: the `Document` class imported from `docx.document` above is only
# there so that editors can offer auto-completion. The callable that the rest
# of this file actually uses is imported at runtime by the try-except block
# below:
try:
    # Calling the class without arguments is expected to raise TypeError
    # (the original author notes that it "inevitably will"), which is what
    # triggers the real import in the handler.
    document = Document()
except TypeError:
    # This is the actual `Document` callable that's going to be used.
    # The import rebinds the module-level name `Document`, overriding the
    # previous import so that `Document(path)` works further down.
    # NOTE(review): if the call above ever stops raising TypeError, this
    # import is skipped and `Document` stays bound to the class - confirm
    # against the installed python-docx version.
    from docx import Document


def get_fulltext(path):
    """
    Extract text from a Word document and return it as one long string.

    The text of every body paragraph is collected first, followed by the text
    of every cell of every table listed in `doc.tables` (not only the first
    table). All pieces are joined with single spaces.

    Args:
        path: Location of the .docx file; passed unchanged to `Document`.

    Returns:
        One string containing all collected text, separated by spaces.
    """
    doc = Document(path)

    textlist = [paragraph.text for paragraph in doc.paragraphs]

    # Looping over `doc.tables` also covers documents without any table (the
    # loop body simply never runs), so no IndexError handling is needed.
    for table in doc.tables:
        for row in table.rows:
            textlist.extend(cell.text for cell in row.cells)

    return " ".join(textlist)


def find(text, search):
    """
    Find every occurrence of a keyword together with its surrounding context.

    The keyword is matched literally and case-insensitively. Up to 3 words
    before and up to 3 words after each occurrence are captured as context.

    Args:
        text: The string to search through.
        search: The keyword or phrase to look for. It is treated as plain
            text, so characters such as "+", "(", "." or "#" match themselves.

    Returns:
        A list with one 4-tuple per match, laid out as
        (whole_match, words_before, keyword, words_after). The list is empty
        when nothing matches or when `search` is empty.
    """
    # An empty keyword would "match" at every position and flood the output.
    if not search:
        return []

    # re.escape() neutralises regex metacharacters in the user's input. It
    # also escapes spaces and "#", which verbose mode would otherwise ignore
    # or treat as the start of a comment.
    escaped = re.escape(search)

    # Regex to preview the previous and next 3 words.
    # Regex101: https://regex101.com/r/AbASbx/2
    # Source: https://stackoverflow.com/a/55492108/16689935
    search_regex = re.compile(
        rf"""(
        ((?:\w+\W*){{0,3}})    # Show 3 words before the match
        ({escaped})            # The actual match
        ((?:\W*\w+){{0,3}})    # Show 3 words after the match
    )""",
        re.VERBOSE | re.IGNORECASE,
    )

    return search_regex.findall(text)


def display_results(matches, filename):
    """
    Print a colored, numbered list of the matches found in one file.

    A header naming the file (in yellow) is printed first. Each match then
    gets its own numbered line showing the words before the keyword, the
    keyword itself (in magenta) and the words after it, wrapped in "...".
    """
    header = f"Match found in {Fore.YELLOW}{filename}{Style.RESET_ALL}:"
    print(header)

    for number, hit in enumerate(matches, 1):
        # Index 0 holds the whole match; 1-3 are before / keyword / after.
        before, keyword, after = hit[1], hit[2], hit[3]
        highlighted = Fore.MAGENTA + keyword + Style.RESET_ALL
        print(f"{number}. ...{before}{highlighted}{after}...")


# Initialize colorama before anything colored is written, so that the ANSI
# escape sequences in the prompt and the header below are filtered as well.
init()

search = input("Enter some text you would like to search: " + Fore.CYAN)
print(f"\n{Fore.GREEN}===== Results ====={Style.RESET_ALL}")

# Recursively collect every .docx file below the current working directory.
# The "[!~]" prefix skips names starting with "~" (presumably the temporary
# files Word leaves next to an open document).
files = Path.cwd().rglob("[!~]*.docx")

for file in files:
    # Show only the containing folder and the file name, not the full path.
    filename = Path(file.parent.name) / Path(file.name)
    fulltext = get_fulltext(file)
    matches = find(fulltext, search)

    if matches:
        display_results(matches, filename)
	"""Search through Word documents and print the related matches."""

	from colorama import init, Fore, Style
	from pathlib import Path
	import re

	# This is a hacky workaround to enable autocompletion for python-docx
	# See here: https://stackoverflow.com/q/42968369/16689935
	from docx.document import Document

	# NOTE: the `Document` class imported from `docx.document` above is only
	# there so that editors can offer auto-completion. The callable that the
	# rest of this file actually uses is imported at runtime by the try-except
	# block below:
	try:
	    # Calling the class without arguments is expected to raise TypeError
	    # (the original author notes that it "inevitably will"), which is
	    # what triggers the real import in the handler.
	    document = Document()
	except TypeError:
	    # This is the actual `Document` callable that's going to be used.
	    # The import rebinds the name `Document`, overriding the previous
	    # import so that `Document(path)` works further down.
	    from docx import Document


	def get_fulltext(path):
	    """
	    Extract text from a Word document and return it as one long string.

	    The text of every body paragraph is collected first, followed by the
	    text of every cell of every table listed in `doc.tables` (not only
	    the first table). All pieces are joined with single spaces.

	    Args:
	        path: Location of the .docx file; passed unchanged to `Document`.

	    Returns:
	        One string containing all collected text, separated by spaces.
	    """
	    doc = Document(path)

	    textlist = [paragraph.text for paragraph in doc.paragraphs]

	    # Looping over `doc.tables` also covers documents without any table
	    # (the loop body never runs), so no IndexError handling is needed.
	    for table in doc.tables:
	        for row in table.rows:
	            textlist.extend(cell.text for cell in row.cells)

	    return " ".join(textlist)


	def find(text, search):
	    """
	    Find every occurrence of a keyword together with its context.

	    The keyword is matched literally and case-insensitively. Up to 3 words
	    before and up to 3 words after each occurrence are captured.

	    Args:
	        text: The string to search through.
	        search: The keyword or phrase to look for. It is treated as plain
	            text, so characters such as "+", "(", "." or "#" match
	            themselves.

	    Returns:
	        A list with one 4-tuple per match, laid out as
	        (whole_match, words_before, keyword, words_after). The list is
	        empty when nothing matches or when `search` is empty.
	    """
	    # An empty keyword would "match" at every position.
	    if not search:
	        return []

	    # re.escape() neutralises regex metacharacters in the user's input. It
	    # also escapes spaces and "#", which verbose mode would otherwise
	    # ignore or treat as the start of a comment.
	    escaped = re.escape(search)

	    # Regex to preview the previous and next 3 words.
	    # Regex101: https://regex101.com/r/AbASbx/2
	    # Source: https://stackoverflow.com/a/55492108/16689935
	    search_regex = re.compile(
	        rf"""(
	        ((?:\w+\W*){{0,3}}) # Show 3 words before the match
	        ({escaped}) # The actual match
	        ((?:\W*\w+){{0,3}}) # Show 3 words after the match
	        )""",
	        re.VERBOSE | re.IGNORECASE,
	    )

	    return search_regex.findall(text)


	def display_results(matches, filename):
	    """
	    Display the search results for one file tidily with colored text.

	    A header naming the file (in yellow) is printed first, followed by one
	    numbered line per match showing the words before the keyword, the
	    keyword itself (in magenta) and the words after it.

	    Args:
	        matches: Sequence of match tuples; indexes 1, 2 and 3 of each are
	            used as the text before, the keyword and the text after.
	        filename: Name shown in the header line.
	    """
	    print(f"Match found in {Fore.YELLOW}{filename}{Style.RESET_ALL}:")

	    for i, match in enumerate(matches, 1):
	        print(
	            f"{i}. ...{match[1]}"
	            f"{Fore.MAGENTA + match[2] + Style.RESET_ALL}"
	            f"{match[3]}..."
	        )


	# Initialize colorama before anything colored is written, so that the ANSI
	# escape sequences in the prompt and the header below are filtered too.
	init()

	search = input("Enter some text you would like to search: " + Fore.CYAN)
	print(f"\n{Fore.GREEN}===== Results ====={Style.RESET_ALL}")

	# Recursively collect every .docx file below the current directory.
	# The "[!~]" prefix skips names starting with "~" (presumably the
	# temporary files Word leaves next to an open document).
	files = Path.cwd().rglob("[!~]*.docx")

	for file in files:
	    # Show only the containing folder and the file name.
	    filename = Path(file.parent.name) / Path(file.name)
	    fulltext = get_fulltext(file)
	    matches = find(fulltext, search)

	    if matches:
	        display_results(matches, filename)