# Carl4/doc_parse.py

## doc_parse.py
import docx
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from typing import Dict

class WordParser:
    """
    A class that parses the text in Microsoft Word documents and tracks keywords using natural language processing techniques.

    Paragraph text is tokenized with NLTK's ``word_tokenize``. English stop
    words and tokens without any letter or digit (pure punctuation) are
    dropped; every remaining token is lower-cased and reduced to its Porter
    stem before being counted.

    Requires the NLTK ``stopwords`` corpus and the tokenizer data used by
    ``word_tokenize`` to be installed.
    """

    def __init__(self, file_path: str):
        """
        Initialize a WordParser instance with the path to the Microsoft Word document.

        The document itself is not opened here; it is read by
        ``parse_keywords``.

        :param file_path: The path to the Microsoft Word document
        """
        self.file_path = file_path
        # Maps each stemmed keyword to the number of times it occurs.
        self.keywords: Dict[str, int] = {}
        # Stored as a set for fast membership tests while filtering tokens.
        self.stop_words = set(nltk.corpus.stopwords.words("english"))

    def parse_keywords(self):
        """
        Parse the text in the Microsoft Word document and track the keywords.

        Only the entries of ``document.paragraphs`` are visited. The counts in
        ``self.keywords`` are rebuilt from scratch on every call, so parsing
        the same document twice does not double the frequencies.
        """
        document = docx.Document(self.file_path)
        stemmer = PorterStemmer()

        # Clear in place (rather than rebinding) so a reference previously
        # obtained from get_keywords() keeps pointing at the live mapping.
        self.keywords.clear()

        for para in document.paragraphs:
            for token in word_tokenize(para.text):
                lowered = token.lower()
                if lowered in self.stop_words:
                    continue
                # word_tokenize emits punctuation as separate tokens; a token
                # with no letter or digit at all is not a keyword.
                if not any(ch.isalnum() for ch in lowered):
                    continue
                stem = stemmer.stem(lowered)
                self.keywords[stem] = self.keywords.get(stem, 0) + 1

    def get_keywords(self) -> Dict[str, int]:
        """
        Return a dictionary of the keywords and their frequency.

        The returned object is the parser's own mapping, not a copy.

        :return: A dictionary of the keywords and their frequency
        """
        return self.keywords
	import docx
	import nltk
	from nltk.tokenize import word_tokenize
	from nltk.stem import PorterStemmer
	from typing import Dict

	class WordParser:
		"""
		A class that parses the text in Microsoft Word documents and tracks keywords using natural language processing techniques.

		Paragraph text is tokenized with NLTK's ``word_tokenize``. English stop
		words and tokens without any letter or digit (pure punctuation) are
		dropped; every remaining token is lower-cased and reduced to its Porter
		stem before being counted.

		Requires the NLTK ``stopwords`` corpus and the tokenizer data used by
		``word_tokenize`` to be installed.
		"""

		def __init__(self, file_path: str):
			"""
			Initialize a WordParser instance with the path to the Microsoft Word document.

			The document itself is not opened here; it is read by
			``parse_keywords``.

			:param file_path: The path to the Microsoft Word document
			"""
			self.file_path = file_path
			# Maps each stemmed keyword to the number of times it occurs.
			self.keywords: Dict[str, int] = {}
			# Stored as a set for fast membership tests while filtering tokens.
			self.stop_words = set(nltk.corpus.stopwords.words("english"))

		def parse_keywords(self):
			"""
			Parse the text in the Microsoft Word document and track the keywords.

			Only the entries of ``document.paragraphs`` are visited. The counts in
			``self.keywords`` are rebuilt from scratch on every call, so parsing
			the same document twice does not double the frequencies.
			"""
			document = docx.Document(self.file_path)
			stemmer = PorterStemmer()

			# Clear in place (rather than rebinding) so a reference previously
			# obtained from get_keywords() keeps pointing at the live mapping.
			self.keywords.clear()

			for para in document.paragraphs:
				for token in word_tokenize(para.text):
					lowered = token.lower()
					if lowered in self.stop_words:
						continue
					# word_tokenize emits punctuation as separate tokens; a token
					# with no letter or digit at all is not a keyword.
					if not any(ch.isalnum() for ch in lowered):
						continue
					stem = stemmer.stem(lowered)
					self.keywords[stem] = self.keywords.get(stem, 0) + 1

		def get_keywords(self) -> Dict[str, int]:
			"""
			Return a dictionary of the keywords and their frequency.

			The returned object is the parser's own mapping, not a copy.

			:return: A dictionary of the keywords and their frequency
			"""
			return self.keywords