Skip to content

Instantly share code, notes, and snippets.

@johndonnelly
Forked from oliver-batey/common_interface.py
Created January 21, 2021 15:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johndonnelly/9c6e939e55be02a5fab8513b2db1d710 to your computer and use it in GitHub Desktop.
Save johndonnelly/9c6e939e55be02a5fab8513b2db1d710 to your computer and use it in GitHub Desktop.
Common interface for parsing txt, docx, pdf, html and pptx
import os
import io
from docx import Document
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
from bs4 import BeautifulSoup
from pptx import Presentation
class DocParser:
    """Single entry point for extracting text from a supported document."""

    def parse(self, document):
        """Return the text extracted from the file at path ``document``.

        The parser function is selected by ``get_format`` from the file
        extension and then called with the same path. ``get_format``
        raises ``ValueError`` when the extension is not supported.
        """
        return get_format(document)(document)
def get_format(document):
    """Return the parser function matching the extension of ``document``.

    Args:
        document: Path of the file to be parsed.

    Returns:
        The parser callable chosen by ``get_parser`` for the extension.

    Raises:
        ValueError: Propagated from ``get_parser`` when the extension is
            not one of the supported formats.
    """
    # Lower-case the extension so 'report.PDF' and 'report.pdf' are treated
    # alike; get_parser compares against lower-case literals only.
    extension = os.path.splitext(document)[-1].lower()
    return get_parser(extension)
def get_parser(format):
    """Map a file extension (including the leading dot) to its parser.

    Args:
        format: Extension string such as ``'.txt'`` or ``'.pdf'``.

    Returns:
        One of ``parse_txt``, ``parse_docx``, ``parse_pdf``, ``parse_html``
        or ``parse_pptx``.

    Raises:
        ValueError: If ``format`` is none of the supported extensions; the
            offending extension is the exception argument.
    """
    # Guard-clause returns: each supported extension exits immediately.
    if format == '.txt':
        return parse_txt
    if format == '.docx':
        return parse_docx
    if format == '.pdf':
        return parse_pdf
    if format == '.html':
        return parse_html
    if format == '.pptx':
        return parse_pptx
    raise ValueError(format)
def parse_txt(document):
    """Read a plain-text file and return its contents on a single line.

    Args:
        document: Path of the text file.

    Returns:
        The file contents with every newline replaced by a space.
    """
    with open(document, 'r') as file:
        # The original bound the result to a misspelled name ('sting') and
        # then returned the undefined 'string'; return the value directly.
        return file.read().replace('\n', ' ')
def parse_docx(document):
    """Return the text of a .docx file as one space-separated string.

    Args:
        document: Path of the .docx file, passed to ``Document``.

    Returns:
        The text of every entry in ``doc.paragraphs``, each followed by a
        single space (so a non-empty result ends with a trailing space).
    """
    paragraphs = Document(document).paragraphs
    return ''.join(f'{paragraph.text} ' for paragraph in paragraphs)
def parse_pdf(document):
    """Extract the text of a PDF file using pdfminer3.

    Args:
        document: Path of the PDF file; it is opened in binary mode.

    Returns:
        The text that ``TextConverter`` wrote while every page returned by
        ``PDFPage.get_pages`` was processed.
    """
    resource_manager = PDFResourceManager()
    # The context manager and try/finally guarantee that the buffer and the
    # converter are closed even when opening or processing the file raises.
    with io.StringIO() as text_buffer:
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams()
        )
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(document, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            # Read the buffer before the converter is closed, matching the
            # original statement order.
            return text_buffer.getvalue()
        finally:
            converter.close()
def parse_html(document):
    """Return the visible text of an HTML file.

    Args:
        document: Path of the HTML file.

    Returns:
        The result of ``soup.get_text()`` after every ``<script>`` and
        ``<style>`` element has been removed from the parsed tree.
    """
    # Use a context manager so the file handle is closed; the original left
    # it open. The tree is built inside the block, while the handle is live.
    with open(document) as html:
        soup = BeautifulSoup(html, features="html.parser")
    # Drop script/style elements so their contents do not end up in the text.
    for script in soup(["script", "style"]):
        script.extract()
    return soup.get_text()
def parse_pptx(document):
    """Return the text of a .pptx file as one space-separated string.

    Args:
        document: Path of the .pptx file, passed to ``Presentation``.

    Returns:
        The ``text`` of every shape on every slide, in slide order, each
        followed by a single space. Shapes without a ``text`` attribute
        are skipped.
    """
    pieces = []
    for slide in Presentation(document).slides:
        for shape in slide.shapes:
            if not hasattr(shape, "text"):
                continue
            pieces.append(f'{shape.text} ')
    return ''.join(pieces)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment