Skip to content

Instantly share code, notes, and snippets.

@Wesitos
Last active April 10, 2017 02:32
Show Gist options
  • Save Wesitos/8edfc90415e2dab0e4d59f28ce2a7bac to your computer and use it in GitHub Desktop.
Save Wesitos/8edfc90415e2dab0e4d59f28ce2a7bac to your computer and use it in GitHub Desktop.
Extraer imágenes de un PDF
# Use python 3
from PyPDF2 import PdfFileReader
from bs4 import BeautifulSoup
import os
def flatten(l):
    """Flatten an arbitrarily nested list into a single flat list.

    Args:
        l: Any value. A value that is not exactly a ``list`` is treated as
            a single leaf element.

    Returns:
        A new list with every leaf of ``l`` in depth-first, left-to-right
        order. A non-list argument yields ``[l]``; an empty list yields
        ``[]``.
    """
    # The exact-type check is deliberate: tuples, strings and list
    # subclasses are kept as leaves rather than being expanded.
    if type(l) is not list:
        return [l]
    flat = []
    # Walk the nesting with an explicit stack of iterators instead of
    # recursing once per element, so long lists cannot hit the
    # interpreter's recursion limit and no intermediate slices are copied.
    pending = [iter(l)]
    while pending:
        for item in pending[-1]:
            if type(item) is list:
                # Descend into the nested list; the outer iterator keeps
                # its position and is resumed once this one is exhausted.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            pending.pop()
    return flat
def extract_images_from_resources(resources):
    """Collect every image XObject reachable from a resources dictionary.

    Entries of the ``/XObject`` dictionary whose ``/Subtype`` is ``/Image``
    are returned as-is. Every other entry is searched recursively through
    its own ``/Resources`` dictionary, when it has one.

    Args:
        resources: A resources dictionary (or a reference to one). ``None``
            is accepted and treated as "no resources".

    Returns:
        A flat list of the image objects found, in dictionary order, with
        nested images placed where their container appears.
    """
    resources = _resolve_pdf_object(resources)
    if resources is None:
        return []
    # .get() is used so a missing key is not an error; the result is
    # resolved explicitly in case it is a reference rather than the object.
    x_object = _resolve_pdf_object(resources.get('/XObject'))
    if x_object is None:
        return []
    images = []
    for entry in x_object.values():
        # Make sure we hold the object itself and not just a "reference".
        obj = _resolve_pdf_object(entry)
        if _resolve_pdf_object(obj.get('/Subtype')) == '/Image':
            images.append(obj)
        else:
            # A non-image XObject may carry no /Resources entry at all;
            # the recursive call then receives None and contributes nothing.
            images.extend(extract_images_from_resources(obj.get('/Resources')))
    return images


def _resolve_pdf_object(obj):
    """Return ``obj.getObject()`` when that method exists, else ``obj``."""
    get_object = getattr(obj, 'getObject', None)
    return get_object() if callable(get_object) else obj
def extract_images_from_reader(reader):
    """Gather the images of every page of a PDF reader.

    Args:
        reader: An object exposing ``numPages`` and ``getPage(index)``,
            such as the ``PdfFileReader`` imported by this file.

    Returns:
        A list with one entry per page, in page order; each entry is the
        result of ``extract_images_from_resources`` for that page's
        ``/Resources`` dictionary.
    """
    images_by_page = []
    for page_index in range(reader.numPages):
        page_resources = reader.getPage(page_index)['/Resources']
        images_by_page.append(extract_images_from_resources(page_resources))
    return images_by_page
# Maps the mimetype declared in an image's metadata to the file extension
# used when the image is written to disk.
format_ext_dict = {
'image/jpeg': '.jpg',
# The front-cover and back-cover images claim to be TIFF
# but are actually JPEG
'image/tiff': '.jpg',
}
# Extract every image embedded in 'libro.pdf' and write each one to
# img/page-<page>-img-<n><ext>, where <ext> is looked up from the mimetype
# declared in the image's XML metadata (empty when it cannot be determined).
with open('libro.pdf', 'rb') as f_pdf:
    reader = PdfFileReader(f_pdf)
    l_imgs = extract_images_from_reader(reader)
    # Create the output directory.
    os.makedirs('img', exist_ok=True)
    # Everything stays inside the ``with`` block: looking up '/Metadata'
    # below may still need to read from the open PDF file.
    for page, imgs in enumerate(l_imgs):
        for n, img in enumerate(imgs):
            # Raw stream bytes exactly as stored on the image object.
            data = img._data
            mimetype = None
            # Not every image is guaranteed to carry a metadata stream;
            # without one the file is written with no extension instead of
            # aborting the whole run.
            if '/Metadata' in img:
                metadata = img['/Metadata']._data
                # We assume the metadata is always formatted as XML.
                metadata_soup = BeautifulSoup(metadata, 'lxml-xml')
                # Extract the mimetype from the metadata, tolerating
                # documents that have no Description element.
                description = metadata_soup.find('Description')
                if description is not None:
                    mimetype = description.attrs.get('format')
            image_path = os.path.join(
                'img', 'page-{}-img-{}{}'.format(
                    page + 1, n, format_ext_dict.get(mimetype, '')))
            with open(image_path, 'wb') as f:
                f.write(data)
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
View raw

(Sorry about that, but we can’t show files that are this big right now.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment