-
-
Save johndonnelly/9c6e939e55be02a5fab8513b2db1d710 to your computer and use it in GitHub Desktop.
Common interface for parsing txt, docx, pdf, html and pptx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import io | |
from docx import Document | |
from pdfminer3.layout import LAParams, LTTextBox | |
from pdfminer3.pdfpage import PDFPage | |
from pdfminer3.pdfinterp import PDFResourceManager | |
from pdfminer3.pdfinterp import PDFPageInterpreter | |
from pdfminer3.converter import PDFPageAggregator | |
from pdfminer3.converter import TextConverter | |
from bs4 import BeautifulSoup | |
from pptx import Presentation | |
class DocParser:
    """Single entry point for extracting text from a document file."""

    def parse(self, document):
        """Return whatever the parser selected for *document* produces.

        ``get_format`` picks the parser function from the path's extension;
        that function is then called with the same path.
        """
        return get_format(document)(document)
def get_format(document):
    """Return the parser function matching *document*'s file extension.

    The extension is lower-cased before lookup so that a name such as
    ``REPORT.PDF`` resolves to the same parser as ``report.pdf``.

    Raises:
        ValueError: propagated from ``get_parser`` when the extension is
            not one of the supported formats.
    """
    # Named ``extension`` rather than ``format`` to avoid shadowing the builtin.
    extension = os.path.splitext(document)[-1]
    return get_parser(extension.lower())
def get_parser(format):
    """Map a file extension (including the leading dot) to its parser function.

    Raises:
        ValueError: if *format* is not '.txt', '.docx', '.pdf', '.html'
            or '.pptx'; the offending value is the exception argument.
    """
    # Guard-clause style: each supported extension returns immediately.
    if format == '.txt':
        return parse_txt
    if format == '.docx':
        return parse_docx
    if format == '.pdf':
        return parse_pdf
    if format == '.html':
        return parse_html
    if format == '.pptx':
        return parse_pptx
    raise ValueError(format)
def parse_txt(document):
    """Return the contents of a plain-text file with newlines turned into spaces.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(document, 'r') as file:
        # The original bound this to a misspelled name (``sting``) and then
        # returned ``string``, which raised NameError on every call.
        string = file.read().replace('\n', ' ')
    return string
def parse_docx(document):
    """Return the text of every paragraph in a .docx file.

    Each paragraph's text is followed by a single space, so a non-empty
    result always ends with a trailing space.
    """
    paragraphs = Document(document).paragraphs
    return ''.join(f'{paragraph.text} ' for paragraph in paragraphs)
def parse_pdf(document):
    """Return the text extracted from every page of a PDF file.

    Pages are rendered through a pdfminer ``TextConverter`` into an
    in-memory buffer, whose contents are returned once all pages have been
    processed.
    """
    resource_manager = PDFResourceManager()
    # ``with`` / ``finally`` ensure the buffer and converter are closed even
    # if opening the file or processing a page raises.
    with io.StringIO() as buffer:
        converter = TextConverter(resource_manager, buffer, laparams=LAParams())
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(document, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            # Read the text before the buffer is closed on leaving ``with``.
            return buffer.getvalue()
        finally:
            converter.close()
def parse_html(document):
    """Return the text of an HTML file with <script> and <style> content removed."""
    # Open inside ``with`` so the handle is closed once BeautifulSoup has
    # consumed it; the original left the file open.
    with open(document) as html:
        soup = BeautifulSoup(html, features="html.parser")
    # Remove script and style elements so their contents are not part of
    # the extracted text.
    for script in soup(["script", "style"]):
        script.extract()
    return soup.get_text()
def parse_pptx(document):
    """Return the text of every shape in a .pptx presentation.

    Slides and their shapes are visited in order; shapes without a ``text``
    attribute are skipped, and each collected text is followed by a space.
    """
    presentation = Presentation(document)
    return ''.join(
        f'{shape.text} '
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment