Last active
October 9, 2023 17:13
-
-
Save 0187773933/47ae1f15a0d2e2d567890deecf9fce06 to your computer and use it in GitHub Desktop.
Extract Images From PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import io | |
from pprint import pprint | |
from pathlib import Path | |
from tqdm import tqdm | |
from binascii import b2a_hex | |
import shutil | |
import tempfile | |
from pdfminer.high_level import extract_pages | |
from pdfminer.pdfparser import PDFParser | |
from pdfminer.pdfdocument import PDFDocument | |
from pdfminer.pdfinterp import resolve1 | |
from PIL import Image , ImageFile | |
ImageFile.LOAD_TRUNCATED_IMAGES = True | |
def get_meta_data(input_file_path):
    """Read document-level metadata from the PDF at *input_file_path*.

    Args:
        input_file_path: Path of the PDF file; opened in binary mode.

    Returns:
        A dict with a single key, ``"total_pages"``, holding the ``Count``
        entry of the document catalog's ``Pages`` node.
    """
    # Keep every access to the parsed document inside the ``with`` block so
    # the parser never reads from a closed handle, and so the handle is
    # released even if parsing raises.
    with open(input_file_path, 'rb') as input_file:
        input_parser = PDFParser(input_file)
        input_document = PDFDocument(input_parser)
        total_pages = resolve1(input_document.catalog['Pages'])['Count']
    return {
        "total_pages": total_pages,
    }
def get_page_layouts(input_file_path, total_pages):
    """Collect every page layout of a PDF into a list, showing progress.

    Args:
        input_file_path: Path of the PDF handed to ``extract_pages``.
        total_pages: Passed to ``tqdm`` as ``total`` for the progress bar.

    Returns:
        A list with one entry per item yielded by ``extract_pages``.
    """
    layout_stream = extract_pages(input_file_path)
    print("Scanning Page Layouts")
    # Draining the iterator through tqdm materialises all layouts at once.
    return list(tqdm(layout_stream, total=total_pages))
def get_lt_figures(page_layout):
    """Return the elements of *page_layout* whose class is named ``LTFigure``.

    Elements are classified by the bare class name (``type(x).__name__``).
    Text boxes, rectangles, curves and lines are skipped silently; the class
    name of anything else is printed so unexpected element kinds are visible.

    Args:
        page_layout: Any iterable of layout elements.

    Returns:
        A list of the ``LTFigure`` elements, in iteration order.
    """
    silently_skipped = {"LTTextBoxHorizontal", "LTRect", "LTCurve", "LTLine"}
    found = []
    for item in page_layout:
        kind = type(item).__name__
        if kind == "LTFigure":
            found.append(item)
            continue
        if kind not in silently_skipped:
            print(kind)
    return found
def get_image_type(stream_first_4_bytes):
    """Guess an image format from the leading bytes of a stream.

    Every signature is matched as a prefix, so the result is the same whether
    the caller passes exactly four bytes or a longer slice of the stream.

    Args:
        stream_first_4_bytes: Bytes-like object holding the start of the
            image data.

    Returns:
        ``'jpeg'``, ``'png'``, ``'gif'`` or ``'bmp'`` when a known signature
        matches; otherwise ``False`` (after printing the unrecognised bytes
        as hex for debugging).
    """
    # (hex prefix, format name) pairs, checked in order.
    signatures = (
        ('ffd8', 'jpeg'),
        ('89504e47', 'png'),
        ('47494638', 'gif'),
        ('424d', 'bmp'),
    )
    bytes_as_hex = b2a_hex(stream_first_4_bytes).decode()
    for hex_prefix, file_type in signatures:
        if bytes_as_hex.startswith(hex_prefix):
            return file_type
    print(bytes_as_hex)
    return False
# def get_image_info_from_stream_raw_data( stream ): | |
# try: | |
# image = Image.open( io.BytesIO( element._objs[ 0 ].stream.rawdata ) ) | |
# return { | |
# "type": get_image_type( element._objs[ 0 ].stream.rawdata[ 0 : 4 ] ) , | |
# "width": element._objs[ 0 ].width , | |
# "height": element._objs[ 0 ].height , | |
# "bounding_box": { | |
# 'x1': element._objs[ 0 ].x0 , 'y1': element._objs[ 0 ].y0 , | |
# 'x2': element._objs[ 0 ].x1 , 'y2': element._objs[ 0 ].y1 , | |
# } , | |
# "pil_image": image | |
# } | |
# except Exception as e: | |
# print( e ) | |
# return False | |
def _bounding_box(layout_object):
    """Return the x0/y0/x1/y1 corners of *layout_object* as an x1/y1/x2/y2 dict."""
    return {
        'x1': layout_object.x0, 'y1': layout_object.y0,
        'x2': layout_object.x1, 'y2': layout_object.y1,
    }


def _build_image_info(layout_object, pil_image, image_type, mode=None):
    """Assemble the info dict for one image; ``"mode"`` is added only when given."""
    info = {"type": image_type}
    if mode is not None:
        info["mode"] = mode
    info["width"] = layout_object.width
    info["height"] = layout_object.height
    info["bounding_box"] = _bounding_box(layout_object)
    info["pil_image"] = pil_image
    return info


def get_image_info(element):
    """Extract the image held by a figure element.

    Three strategies are tried against the figure's first child
    (``element._objs[0]``):

    1. Open the child's ``stream.rawdata`` directly with Pillow.
    2. If the child has no ``stream`` attribute, open the raw data of *its*
       first child instead (the figure is nested one level deeper).
    3. Otherwise decode the child's stream with ``get_data()`` and interpret
       the bytes as raw pixels, once per mode in ``pillow_modes``.

    Args:
        element: A figure-like object exposing ``_objs``.

    Returns:
        * a dict with keys ``type``, ``width``, ``height``, ``bounding_box``
          and ``pil_image`` when strategy 1 or 2 succeeds (``type`` is
          ``False`` if the signature is unknown), or
        * a list of such dicts (each with an extra ``mode`` key and
          ``type`` fixed to ``"png"``) from strategy 3; the list is empty
          when nothing could be decoded.

        ``False`` is never returned, so callers that test
        ``is not False`` keep working unchanged.
    """
    # Strategy 1: the stream is already a self-contained image file.
    try:
        first_child = element._objs[0]
        raw_data = first_child.stream.rawdata
        image = Image.open(io.BytesIO(raw_data))
        return _build_image_info(first_child, image, get_image_type(raw_data[0:4]))
    except Exception:
        # Deliberate best effort: any failure here (no children, no stream,
        # data Pillow cannot identify) just means the fallbacks are needed.
        pass

    # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
    # https://github.com/pdfminer/pdfminer.six/issues/144
    # https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=frombytes#PIL.Image.frombytes
    pillow_modes = ["RGB"]
    candidate_images = []
    for pillow_mode in pillow_modes:
        try:
            first_child = element._objs[0]
            # hasattr, not ``"stream" in obj``: a membership test never looks
            # at attributes and raises TypeError on non-iterable objects,
            # which previously made the raw-pixel fallback unreachable.
            if not hasattr(first_child, "stream"):
                # Strategy 2: nested figure; size and bounding box are still
                # reported from the outer child, as before.
                nested_data = first_child._objs[0].stream.rawdata
                image = Image.open(io.BytesIO(nested_data))
                return _build_image_info(
                    first_child, image, get_image_type(nested_data[0:4])
                )
            # Strategy 3: raw pixel data.
            if hasattr(first_child, "srcsize"):
                source_size = first_child.srcsize
            else:
                source_size = (first_child.width, first_child.height)
            figure_bytes = first_child.stream.get_data()
            image = Image.frombytes(pillow_mode, source_size, figure_bytes, 'raw')
            candidate_images.append(
                _build_image_info(first_child, image, "png", mode=pillow_mode)
            )
        except Exception as e:
            # Report and move on to the next mode instead of aborting.
            print(e)
    return candidate_images
if __name__ == "__main__":
    # Usage: <script> <path-to-pdf>
    # Extracted images are written to a directory next to the PDF, named
    # after the PDF without its suffix.
    if len(sys.argv) < 2:
        sys.exit("Usage: pass the path of the PDF to extract images from")
    input_file_path_posix = Path(sys.argv[1])
    # Validate before touching the file system: the output directory is wiped
    # below, which must not happen for a mistyped input path.
    if not input_file_path_posix.is_file():
        sys.exit(f"Not a file: {input_file_path_posix}")
    output_base_path = input_file_path_posix.parent.joinpath(input_file_path_posix.stem)
    # NOTE: any existing directory of that name is deleted first.
    shutil.rmtree(output_base_path, ignore_errors=True)
    output_base_path.mkdir(parents=True, exist_ok=True)
    meta_data = get_meta_data(sys.argv[1])
    page_layouts = get_page_layouts(sys.argv[1], meta_data["total_pages"])
    for page_layout_index, page_layout in enumerate(page_layouts):
        figures = get_lt_figures(page_layout)
        for figure_index, figure in enumerate(figures):
            image_info = get_image_info(figure)
            if image_info is False:
                continue
            if isinstance(image_info, list):
                # Raw-pixel fallback: one candidate per attempted Pillow mode.
                for candidate in image_info:
                    try:
                        candidate["image_save_path"] = output_base_path.joinpath( f"Page - {page_layout_index+1} - {figure_index+1} - {candidate['mode']}.{candidate['type']}" )
                        # Print the candidate being saved, not the whole list.
                        pprint(candidate)
                        candidate["pil_image"].save(str(candidate["image_save_path"]))
                    except Exception as e:
                        print(e)
            elif image_info["type"] is not False:
                image_info["image_save_path"] = output_base_path.joinpath( f"Page - {page_layout_index+1} - {figure_index+1}.{image_info['type']}" )
                pprint(image_info)
                # Guarded like the candidate branch so one image that cannot
                # be written does not abort the remaining pages.
                try:
                    image_info["pil_image"].save(str(image_info["image_save_path"]))
                except Exception as e:
                    print(e)
            else:
                # Unknown signature: nothing is saved, but show what was found.
                pprint(image_info)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment