Last active
April 10, 2017 02:32
-
-
Save Wesitos/8edfc90415e2dab0e4d59f28ce2a7bac to your computer and use it in GitHub Desktop.
Extraer imagenes de un pdf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Use Python 3
from PyPDF2 import PdfFileReader | |
from bs4 import BeautifulSoup | |
import os | |
def flatten(l):
    """Flatten arbitrarily nested lists into a single flat list.

    Args:
        l: Any object. Only values whose exact type is ``list`` are
            descended into; everything else (including tuples and list
            subclasses) is treated as a leaf.

    Returns:
        A new list with every leaf in depth-first, left-to-right order.
        A non-list argument yields a one-element list containing it.
    """
    if type(l) is not list:
        return [l]
    flat = []
    # Explicit stack of iterators instead of recursing on head/tail: no
    # per-step slice copies and no RecursionError on long or deep inputs.
    stack = [iter(l)]
    while stack:
        for item in stack[-1]:
            if type(item) is list:
                # Descend into the nested list; the parent iterator keeps
                # its position and is resumed once the child is exhausted.
                stack.append(iter(item))
                break
            flat.append(item)
        else:
            # The innermost iterator is exhausted.
            stack.pop()
    return flat
def _resolve_pdf_object(obj):
    """Return the object behind an indirect reference, or ``obj`` itself."""
    get_object = getattr(obj, 'getObject', None)
    return get_object() if callable(get_object) else obj


def extract_images_from_resources(resources):
    """Collect every image XObject reachable from a resources dictionary.

    Args:
        resources: A PDF resources dictionary (dict-like), e.g. the
            ``/Resources`` entry of a page or of a form XObject.

    Returns:
        A flat list of the XObjects whose ``/Subtype`` is ``/Image``, in
        dictionary order. XObjects that are not images are searched
        recursively through their own ``/Resources``; those without a
        ``/Resources`` entry are skipped.
    """
    x_object = resources.get('/XObject')
    if x_object is None:
        return []
    # NOTE(review): dict-style ``.get`` is not guaranteed to resolve
    # indirect references, so make sure we hold the actual dictionary.
    x_object = _resolve_pdf_object(x_object)
    images = []
    for obj in x_object.values():
        # Make sure we have the object itself and not just a "reference".
        obj = _resolve_pdf_object(obj)
        if obj.get('/Subtype') == '/Image':
            images.append(obj)
        elif '/Resources' in obj:
            images.extend(extract_images_from_resources(obj['/Resources']))
        # Otherwise: a non-image XObject with no resources of its own,
        # so there is nothing to descend into.
    return images
def extract_images_from_reader(reader):
    """Gather the images of every page of ``reader``.

    Args:
        reader: Object exposing ``numPages`` and ``getPage(n)``, where
            each page can be indexed with ``'/Resources'``.

    Returns:
        A list with one entry per page, in page order; each entry is the
        result of ``extract_images_from_resources`` for that page.
    """
    images_per_page = []
    for page_number in range(reader.numPages):
        page = reader.getPage(page_number)
        page_images = extract_images_from_resources(page['/Resources'])
        images_per_page.append(page_images)
    return images_per_page
# Maps the mimetype declared in an image's metadata to the file extension
# used when saving that image.
# The front and back cover images claim to be TIFF but are actually JPEG,
# so both mimetypes map to '.jpg'.
format_ext_dict = dict.fromkeys(('image/jpeg', 'image/tiff'), '.jpg')
# Script body: dump every image found in 'libro.pdf' into the 'img' directory,
# one file per image, named 'page-<page>-img-<index><ext>'.
with open('libro.pdf', 'rb') as f_pdf:
    reader = PdfFileReader(f_pdf)
    l_imgs = extract_images_from_reader(reader)
    # Create the output directory
    os.makedirs('img', exist_ok=True)
    # NOTE(review): the loop stays inside the ``with`` block because the
    # metadata streams are looked up here and presumably still need the
    # PDF file to be open.
    for page, imgs in enumerate(l_imgs):
        for n, img in enumerate(imgs):
            data = img._data
            mimetype = None
            # Not every image carries a /Metadata stream; without one the
            # image is still saved, just without a file extension.
            if '/Metadata' in img:
                metadata = img['/Metadata']._data
                # We assume the metadata is always formatted as XML
                metadata_soup = BeautifulSoup(metadata, 'lxml-xml')
                # Extract the mimetype from the metadata
                description = metadata_soup.find('Description')
                if description is not None:
                    mimetype = description.attrs.get('format')
            image_path = os.path.join(
                'img', 'page-{}-img-{}{}'.format(
                    page+1, n, format_ext_dict.get(mimetype, '')))
            with open(image_path, 'wb') as f:
                f.write(data)
View raw
(Sorry about that, but we can’t show files that are this big right now.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment