Skip to content

Instantly share code, notes, and snippets.

@rguliev
Forked from terencezl/convert_pdf.py
Last active March 25, 2024 21:12
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save rguliev/3d886d38daa8ac0be8ddb85d645fb0bc to your computer and use it in GitHub Desktop.
Save rguliev/3d886d38daa8ac0be8ddb85d645fb0bc to your computer and use it in GitHub Desktop.
Python 3: pdfminer code to convert pdf to text, html or xml
# Use `pip3 install pdfminer.six` for python3
from typing import Container
from io import BytesIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert_pdf(
    path: str,
    format: str = "text",
    codec: str = "utf-8",
    password: str = "",
    maxpages: int = 0,
    caching: bool = True,
    pagenos: Container[int] = frozenset(),
) -> str:
    """Convert a PDF file to text, HTML or XML and return it as a string.

    Parameters
    ----------
    path : str
        Path to the pdf file.
    format : str, optional
        Format of output, must be one of: "text", "html", "xml".
        By default, "text" format is used.
    codec : str, optional
        Encoding the converter writes with; the same encoding is used to
        decode the result. By default "utf-8" is used.
    password : str, optional
        Password passed to pdfminer for opening the document.
    maxpages : int, optional
        Max number of pages to convert. By default is 0, i.e. reads all pages.
    caching : bool, optional
        Forwarded to pdfminer's page iterator. By default is True.
    pagenos : Container[int], optional
        Pages to convert. An empty container (the default) converts every
        page. NOTE(review): pdfminer appears to match these against
        zero-based page indices -- confirm against the installed version.

    Returns
    -------
    str
        Converted pdf file.

    Raises
    ------
    ValueError
        If `format` is not one of "text", "html" or "xml".
    OSError
        If `path` cannot be opened for reading.
    """
    # Validate the requested format before allocating anything, so a bad
    # argument fails fast and leaves nothing to clean up.
    if format == "text":
        converter_cls = TextConverter
    elif format == "html":
        converter_cls = HTMLConverter
    elif format == "xml":
        converter_cls = XMLConverter
    else:
        raise ValueError("provide format, either text, html or xml!")

    rsrcmgr = PDFResourceManager()
    # Context managers guarantee the file and the buffer are released even
    # when pdfminer raises part-way through the document.
    with open(path, "rb") as fp, BytesIO() as retstr:
        device = converter_cls(
            rsrcmgr, retstr, codec=codec, laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(
                fp,
                pagenos,
                maxpages=maxpages,
                password=password,
                caching=caching,
                check_extractable=True,
            ):
                interpreter.process_page(page)
        finally:
            # The HTML and XML converters emit their closing markup from
            # close(), so the device must be closed *before* the buffer is
            # read; otherwise the returned document is truncated.
            device.close()
        # Decode with the same codec the converter encoded with, not the
        # interpreter default.
        return retstr.getvalue().decode(codec)
@devanghingu
Copy link

What if I want to convert each page into a separate HTML page?

@rguliev
Copy link
Author

rguliev commented Apr 20, 2022

Sorry, the code is quite old and I do not remember all the context. I guess you can just split your file into multiple single-page files and apply this function to each of them.

@bbbuserkz
Copy link

Thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment