Skip to content

Instantly share code, notes, and snippets.

@oliver-batey
Last active January 21, 2021 15:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save oliver-batey/8e125cd8d4c3f7ca2df8fde002fa0a30 to your computer and use it in GitHub Desktop.
Save oliver-batey/8e125cd8d4c3f7ca2df8fde002fa0a30 to your computer and use it in GitHub Desktop.
Common interface for parsing txt, docx, pdf, html and pptx
import os
import io
from docx import Document
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
from bs4 import BeautifulSoup
from pptx import Presentation
class DocParser:
    """Common entry point for extracting text from a document file."""

    def parse(self, document):
        """Return the result of running the matching parser on ``document``.

        ``get_format`` chooses a parser function for ``document``; that
        function is then called with ``document`` and its result returned.
        """
        return get_format(document)(document)
def get_format(document):
    """Return the parser function matching the file extension of ``document``.

    The extension is taken with ``os.path.splitext`` and lower-cased before
    it is handed to ``get_parser``, so ``report.PDF`` and ``report.pdf``
    select the same parser.

    Args:
        document: Path of the file whose extension selects the parser.

    Returns:
        Whatever ``get_parser`` returns for the lower-cased extension.

    Raises:
        Whatever ``get_parser`` raises for an extension it does not accept.
    """
    # Named ``extension`` (not ``format``) so the builtin is not shadowed.
    extension = os.path.splitext(document)[-1]
    return get_parser(extension.lower())
def get_parser(format):
    """Return the parse function registered for the extension ``format``.

    Args:
        format: File extension including the leading dot, e.g. ``'.pdf'``.

    Returns:
        One of ``parse_txt``, ``parse_docx``, ``parse_pdf``, ``parse_html``
        or ``parse_pptx``.

    Raises:
        ValueError: If ``format`` equals none of the known extensions; the
            offending value is the exception argument.
    """
    # Ordered (extension, parser) pairs, checked top to bottom.
    known_parsers = (
        ('.txt', parse_txt),
        ('.docx', parse_docx),
        ('.pdf', parse_pdf),
        ('.html', parse_html),
        ('.pptx', parse_pptx),
    )
    for extension, handler in known_parsers:
        if format == extension:
            return handler
    raise ValueError(format)
def parse_txt(document, encoding=None):
    """Return the contents of a text file with newlines replaced by spaces.

    Args:
        document: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        The file's text with every newline character replaced by one space.
    """
    with open(document, 'r', encoding=encoding) as file:
        # Return the value directly; there is no intermediate name to get
        # out of sync with the ``return`` statement.
        return file.read().replace('\n', ' ')
def parse_docx(document):
    """Return the text of each paragraph in a .docx file, space-terminated.

    ``document`` is passed to ``Document``; the ``text`` of every entry in
    its ``paragraphs`` is emitted in order, each followed by one space.
    """
    paragraphs = Document(document).paragraphs
    return ''.join(f'{paragraph.text} ' for paragraph in paragraphs)
def parse_pdf(document):
    """Return the text extracted from every page of a PDF file.

    Each page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` whose ``TextConverter`` writes into an in-memory
    text buffer; the buffer's contents are returned.

    Args:
        document: Path of the PDF file, opened in binary mode.

    Returns:
        The accumulated text written by the converter.
    """
    resource_manager = PDFResourceManager()
    # ``with`` / ``finally`` guarantee the buffer and the converter are
    # closed even when opening or interpreting the PDF raises.
    with io.StringIO() as text_buffer:
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams()
        )
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(document, 'rb') as fh:
                pages = PDFPage.get_pages(
                    fh,
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    page_interpreter.process_page(page)
            # Read the text before the converter and buffer are closed.
            return text_buffer.getvalue()
        finally:
            converter.close()
def parse_html(document):
    """Return the text of an HTML file with script and style elements removed.

    Args:
        document: Path of the HTML file to read.

    Returns:
        The result of ``get_text()`` on the parsed document after every
        ``<script>`` and ``<style>`` element has been taken out of the tree.
    """
    # Parse inside the ``with`` block so the file handle is closed as soon
    # as the tree has been built, instead of being left open.
    with open(document) as html:
        soup = BeautifulSoup(html, features="html.parser")
    for tag in soup(["script", "style"]):
        tag.extract()
    return soup.get_text()
def parse_pptx(document):
    """Return the text of every text-bearing shape in a .pptx file.

    ``document`` is passed to ``Presentation``; slides and their shapes are
    visited in order, and each shape that has a ``text`` attribute
    contributes that text followed by one space.
    """
    slides = Presentation(document).slides
    return ''.join(
        f'{shape.text} '
        for slide in slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment