# oliver-batey/common_interface.py

## common_interface.py
import os
import io
from docx import Document

from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter

from bs4 import BeautifulSoup

from pptx import Presentation


class DocParser:
    """Single entry point for extracting text from a document file.

    The concrete parser is selected from the file extension of the path
    handed to ``parse``.
    """

    def parse(self, document):
        """Return the text extracted from the file at path ``document``.

        ``get_format`` picks the parser callable for the path's extension
        and that callable is then applied to the same path. Any exception
        raised while selecting or running the parser propagates unchanged.
        """
        return get_format(document)(document)

def get_format(document):
    """Return the parser callable matching the file extension of ``document``.

    The extension (as produced by ``os.path.splitext``, e.g. ``'.pdf'``) is
    lower-cased before lookup so that ``REPORT.PDF`` and ``report.pdf`` are
    treated alike.

    Raises whatever ``get_parser`` raises for an extension it does not
    recognise.
    """
    # Named ``extension`` rather than ``format`` to avoid shadowing the builtin.
    extension = os.path.splitext(document)[-1].lower()
    return get_parser(extension)

def get_parser(format):
    """Map a file extension to the function that parses that file type.

    ``format`` is the extension including its leading dot, e.g. ``'.docx'``.
    Matching is exact (case-sensitive). An extension other than the five
    handled below raises ``ValueError`` carrying the extension as its
    argument.
    """
    # Guard-clause form: each supported extension returns immediately.
    if format == '.txt':
        return parse_txt
    if format == '.docx':
        return parse_docx
    if format == '.pdf':
        return parse_pdf
    if format == '.html':
        return parse_html
    if format == '.pptx':
        return parse_pptx
    raise ValueError(format)

def parse_txt(document):
    """Return the contents of a plain-text file as a single line.

    ``document`` is the path of the file to read; it is opened in text mode
    with the platform's default encoding. Every newline character is
    replaced by one space, so the returned string contains no newlines.
    """
    with open(document, 'r') as file:
        # Return directly: the previous version stored the text under a
        # misspelt name and then returned an undefined one (NameError).
        return file.read().replace('\n', ' ')

def parse_docx(document):
    """Return the text of the paragraphs of a .docx file, space-separated.

    ``document`` is passed straight to ``Document``. Each paragraph's text
    is followed by a single space, so a non-empty result ends with a
    trailing space; no paragraphs yields ``''``.
    """
    paragraphs = Document(document).paragraphs
    return ''.join(f'{paragraph.text} ' for paragraph in paragraphs)

def parse_pdf(document):
    """Return the text extracted from the pages of a PDF file.

    ``document`` is the path of the PDF. The file is opened in binary mode
    and each page yielded by ``PDFPage.get_pages`` is processed through a
    ``TextConverter`` that writes into an in-memory text buffer; the
    buffer's accumulated content is returned.

    The converter and the buffer are closed even when opening the file or
    processing a page raises; the exception itself still propagates.
    """
    resource_manager = PDFResourceManager()
    with io.StringIO() as text_buffer:
        converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(document, 'rb') as fh:
                pages = PDFPage.get_pages(fh, caching=True, check_extractable=True)
                for page in pages:
                    page_interpreter.process_page(page)
            # Read the buffer before the converter and buffer are closed.
            return text_buffer.getvalue()
        finally:
            converter.close()

def parse_html(document):
    """Return the text of an HTML file with script and style content removed.

    ``document`` is the path of the file. The markup is parsed with
    BeautifulSoup using the ``html.parser`` backend, every ``<script>`` and
    ``<style>`` element is extracted from the tree, and the text of what
    remains is returned via ``get_text()``.
    """
    # Parse inside ``with`` so the file handle is closed; it used to be
    # opened and never closed.
    with open(document) as html:
        soup = BeautifulSoup(html, features="html.parser")
    for element in soup(["script", "style"]):
        element.extract()
    return soup.get_text()

def parse_pptx(document):
    """Return the text of every text-bearing shape in a .pptx file.

    ``document`` is passed straight to ``Presentation``. Slides are visited
    in order and, within each slide, every shape exposing a ``text``
    attribute contributes its text followed by a single space. Shapes
    without that attribute are skipped.
    """
    slides = Presentation(document).slides
    return ''.join(
        f'{shape.text} '
        for slide in slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
	import os
	import io
	from docx import Document

	from pdfminer3.layout import LAParams, LTTextBox
	from pdfminer3.pdfpage import PDFPage
	from pdfminer3.pdfinterp import PDFResourceManager
	from pdfminer3.pdfinterp import PDFPageInterpreter
	from pdfminer3.converter import PDFPageAggregator
	from pdfminer3.converter import TextConverter

	from bs4 import BeautifulSoup

	from pptx import Presentation


	class DocParser:
		"""Single entry point for extracting text from a document file.

		The concrete parser is selected from the file extension of the path
		handed to ``parse``.
		"""

		def parse(self, document):
			"""Return the text extracted from the file at path ``document``.

			``get_format`` picks the parser callable for the path's extension
			and that callable is then applied to the same path. Any exception
			raised while selecting or running the parser propagates unchanged.
			"""
			parser = get_format(document)
			return parser(document)

	def get_format(document):
		"""Return the parser callable matching the file extension of ``document``.

		The extension (as produced by ``os.path.splitext``, e.g. ``'.pdf'``) is
		lower-cased before lookup so that ``REPORT.PDF`` and ``report.pdf`` are
		treated alike.

		Raises whatever ``get_parser`` raises for an extension it does not
		recognise.
		"""
		# Named ``extension`` rather than ``format`` to avoid shadowing the builtin.
		extension = os.path.splitext(document)[-1].lower()
		return get_parser(extension)

	def get_parser(format):
		"""Map a file extension to the function that parses that file type.

		``format`` is the extension including its leading dot, e.g. ``'.docx'``.
		Matching is exact (case-sensitive). An extension other than the five
		handled below raises ``ValueError`` carrying the extension as its
		argument.
		"""
		if format == '.txt':
			return parse_txt
		elif format == '.docx':
			return parse_docx
		elif format == '.pdf':
			return parse_pdf
		elif format == '.html':
			return parse_html
		elif format == '.pptx':
			return parse_pptx
		else:
			raise ValueError(format)

	def parse_txt(document):
		"""Return the contents of a plain-text file as a single line.

		``document`` is the path of the file to read; it is opened in text mode
		with the platform's default encoding. Every newline character is
		replaced by one space, so the returned string contains no newlines.
		"""
		with open(document, 'r') as file:
			# Return directly: the previous version stored the text under a
			# misspelt name and then returned an undefined one (NameError).
			return file.read().replace('\n', ' ')

	def parse_docx(document):
		"""Return the text of the paragraphs of a .docx file, space-separated.

		``document`` is passed straight to ``Document``. Each paragraph's text
		is followed by a single space, so a non-empty result ends with a
		trailing space; no paragraphs yields ``''``.
		"""
		doc = Document(document)
		# join builds the result in one pass instead of repeated ``+=``.
		return ''.join(f'{para.text} ' for para in doc.paragraphs)

	def parse_pdf(document):
		"""Return the text extracted from the pages of a PDF file.

		``document`` is the path of the PDF. The file is opened in binary mode
		and each page yielded by ``PDFPage.get_pages`` is processed through a
		``TextConverter`` that writes into an in-memory text buffer; the
		buffer's accumulated content is returned.

		The converter and the buffer are closed even when opening the file or
		processing a page raises; the exception itself still propagates.
		"""
		resource_manager = PDFResourceManager()
		with io.StringIO() as file_handle:
			converter = TextConverter(resource_manager, file_handle, laparams=LAParams())
			try:
				page_interpreter = PDFPageInterpreter(resource_manager, converter)
				with open(document, 'rb') as fh:
					pages = PDFPage.get_pages(fh, caching=True, check_extractable=True)
					for page in pages:
						page_interpreter.process_page(page)
				# Read the buffer before the converter and buffer are closed.
				return file_handle.getvalue()
			finally:
				converter.close()

	def parse_html(document):
		"""Return the text of an HTML file with script and style content removed.

		``document`` is the path of the file. The markup is parsed with
		BeautifulSoup using the ``html.parser`` backend, every ``<script>`` and
		``<style>`` element is extracted from the tree, and the text of what
		remains is returned via ``get_text()``.
		"""
		# Parse inside ``with`` so the file handle is closed; it used to be
		# opened and never closed.
		with open(document) as html:
			soup = BeautifulSoup(html, features="html.parser")
		for script in soup(["script", "style"]):
			script.extract()
		return soup.get_text()

	def parse_pptx(document):
		"""Return the text of every text-bearing shape in a .pptx file.

		``document`` is passed straight to ``Presentation``. Slides are visited
		in order and, within each slide, every shape exposing a ``text``
		attribute contributes its text followed by a single space. Shapes
		without that attribute are skipped.
		"""
		pres = Presentation(document)
		# join builds the result in one pass instead of repeated ``+=``.
		return ''.join(
			f'{shape.text} '
			for slide in pres.slides
			for shape in slide.shapes
			if hasattr(shape, "text")
		)