Created
February 15, 2022 22:04
-
-
Save delannoy/0db41032713f1256e63ab619c0a0f19d to your computer and use it in GitHub Desktop.
Extract all Images from PDF (PyMuPDF, Pillow)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# [Extract all Images from PDF in Python](https://aliarefwriorr.medium.com/extract-all-images-from-pdf-in-python-cda3dc195abd) | |
import io | |
import fitz | |
import os | |
import PIL.Image | |
import requests | |
def downloadPDF(url: str, filename: str, timeout: float = 60.0, chunkSize: int = 1 << 16) -> None:
    """Download `url` and write the response body to `filename`.

    Args:
        url: Address of the file to fetch.
        filename: Local path the downloaded bytes are written to (binary mode).
        timeout: Seconds passed to `requests.get` as its timeout, so a stalled
            server cannot block the script forever.
        chunkSize: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            output file is not created in that case.
    """
    print(f'downloading {url}...')
    # Stream the body so a large PDF is never held in memory in one piece.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTML error page under a .pdf name.
        response.raise_for_status()
        with open(filename, mode='wb') as pdf:
            for chunk in response.iter_content(chunk_size=chunkSize):
                pdf.write(chunk)
def extract_images(pdf: fitz.Document, page: int, imgDir: str = 'img') -> None:
    """Save every image referenced by one page of `pdf` into `imgDir`.

    Files are named `<page>-<idx>.<ext>`, where `idx` counts the page's images
    starting at 1 and `ext` is the extension reported by `pdf.extract_image`.

    Args:
        pdf: Open document to read from.
        page: Index of the page, used both to look the page up (`pdf[page]`)
            and in the output file names.
        imgDir: Output directory; created if it does not exist, even when the
            page has no images.
    """
    imageList = pdf[page].get_images()
    os.makedirs(imgDir, exist_ok=True)
    if not imageList:
        return
    # Progress output: only pages that actually carry images are reported.
    print(page)
    for idx, img in enumerate(imageList, start=1):
        # The first field of each image entry is what extract_image() expects.
        data = pdf.extract_image(img[0])
        target = os.path.join(imgDir, f'{page}-{idx}.{data.get("ext")}')
        with PIL.Image.open(io.BytesIO(data.get('image'))) as image:
            # Pillow picks the output format from the file extension; the
            # former `mode='wb'` argument belongs to open(), not Image.save().
            image.save(target)
def main(url: str = 'https://cds.cern.ch/record/357153/files/HCAL_TDR1997.pdf') -> None:
    """Download the PDF at `url` and extract the images of every page.

    The local file name is the last `/`-separated component of `url`, and the
    file is written to the current working directory.

    Args:
        url: Address of the PDF to download and process.
    """
    filename = url.split('/')[-1]
    downloadPDF(url=url, filename=filename)
    # Use the document as a context manager so it is closed even when
    # extraction fails part-way through.
    with fitz.open(filename) as pdf:
        for page in range(pdf.page_count):
            extract_images(pdf, page)
# Script entry point: run the download-and-extract pipeline only when this file
# is executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Create an isolated virtual environment, install the script's dependencies,
# and run the image extractor.
# NOTE: the last command assumes the Python script is saved as
# extractImagesPDF.py in the current directory.

# Stop at the first failing step instead of running against a broken setup.
set -e

python3 -m venv pymupdf
source pymupdf/bin/activate
# requests is imported by the Python script, so it must be installed as well.
python3 -m pip install PyMuPDF Pillow requests
python3 -m extractImagesPDF
I would suggest you contact the author of the linked article (Extract all Images from PDF in Python) since the code derives from there.
Thank you, I will contact him
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks a lot for this script. Does it have a license? I would like to build a derivative work from it as part of a larger Python program.