Created
February 3, 2023 00:31
-
-
Save Carl4/5d7ad5d94f577a06868b77232184e0b3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import docx | |
import nltk | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import PorterStemmer | |
from typing import Dict | |
class WordParser:
    """
    A class that parses the text in Microsoft Word documents and tracks keywords using natural language processing techniques.

    A keyword is the lower-cased, Porter-stemmed form of a token found in the
    document's paragraphs. English stop words and tokens made up purely of
    punctuation are not counted.
    """

    def __init__(self, file_path: str):
        """
        Initialize a WordParser instance with the path to the Microsoft Word document.

        The document itself is not opened here; it is read by ``parse_keywords``.
        The English stop-word set is loaded from NLTK's ``stopwords`` corpus, so
        that corpus has to be available when the instance is created.

        :param file_path: The path to the Microsoft Word document
        """
        self.file_path = file_path
        # Maps each stemmed keyword to the number of times it occurs.
        self.keywords: Dict[str, int] = {}
        # A set gives constant-time membership tests in the token loop.
        self.stop_words = set(nltk.corpus.stopwords.words("english"))

    def parse_keywords(self):
        """
        Parse the text in the Microsoft Word document and track the keywords.

        Only the paragraphs exposed by ``document.paragraphs`` are read. Each
        paragraph is tokenized; tokens that are stop words (compared in lower
        case) or that contain no alphanumeric character are skipped, and the
        remaining tokens are lower-cased, stemmed and counted.

        The counts in ``self.keywords`` are replaced on every call, so calling
        this method repeatedly does not inflate the frequencies.
        """
        document = docx.Document(self.file_path)
        stemmer = PorterStemmer()

        counts: Dict[str, int] = {}
        for para in document.paragraphs:
            for token in word_tokenize(para.text):
                lowered = token.lower()
                if lowered in self.stop_words:
                    continue
                # The tokenizer yields punctuation (",", ".", quotes, ...) as
                # separate tokens; those are not keywords.
                if not any(char.isalnum() for char in lowered):
                    continue
                stem = stemmer.stem(lowered)
                counts[stem] = counts.get(stem, 0) + 1

        # Update in place so a dict previously handed out by get_keywords()
        # keeps reflecting the current counts.
        self.keywords.clear()
        self.keywords.update(counts)

    def get_keywords(self) -> Dict[str, int]:
        """
        Return a dictionary of the keywords and their frequency.

        The returned object is the instance's own ``keywords`` dictionary, not
        a copy; it is empty until ``parse_keywords`` has been called.

        :return: A dictionary of the keywords and their frequency
        """
        return self.keywords
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment