Skip to content

Instantly share code, notes, and snippets.

@0187773933
Last active October 9, 2023 17:13
Show Gist options
  • Save 0187773933/47ae1f15a0d2e2d567890deecf9fce06 to your computer and use it in GitHub Desktop.
Save 0187773933/47ae1f15a0d2e2d567890deecf9fce06 to your computer and use it in GitHub Desktop.
Extract Images From PDF
#!/usr/bin/env python3
import sys
import io
from pprint import pprint
from pathlib import Path
from tqdm import tqdm
from binascii import b2a_hex
import shutil
import tempfile
from pdfminer.high_level import extract_pages
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import resolve1
from PIL import Image , ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
def get_meta_data( input_file_path ):
    """Read document-level metadata from a PDF file.

    Args:
        input_file_path: Path of the PDF to inspect; opened in binary mode.

    Returns:
        A dict with one key, "total_pages", holding the `Count` entry of the
        document catalog's resolved `Pages` node.
    """
    # The context manager closes the handle once the catalog has been read;
    # previously the file object was never closed.
    with open( input_file_path , 'rb' ) as input_file:
        input_parser = PDFParser( input_file )
        input_document = PDFDocument( input_parser )
        # catalog['Pages'] may be an indirect reference, so resolve it before indexing.
        total_pages = resolve1( input_document.catalog['Pages'] )['Count']
    return {
        "total_pages": total_pages ,
    }
def get_page_layouts( input_file_path , total_pages ):
    """Collect every page layout yielded by `extract_pages` into a list.

    Args:
        input_file_path: Path of the PDF handed to `extract_pages`.
        total_pages: Passed to tqdm as `total` so the progress bar has a length.

    Returns:
        A list with one entry per item yielded by `extract_pages`, in order.
    """
    layout_iterator = extract_pages( input_file_path )
    print( "Scanning Page Layouts" )
    # Draining the tqdm-wrapped iterator into a list shows progress while scanning.
    return list( tqdm( layout_iterator , total=total_pages ) )
def get_lt_figures( page_layout ):
    """Return the elements of `page_layout` whose class is named "LTFigure".

    Elements are classified by class name only. Text boxes, rects, curves and
    lines are skipped silently; any other class name is printed so unexpected
    element kinds are visible on the console.

    Args:
        page_layout: Iterable of layout elements.

    Returns:
        A list of the matching elements, in iteration order.
    """
    silently_skipped = ( "LTTextBoxHorizontal" , "LTRect" , "LTCurve" , "LTLine" )
    found_figures = []
    for item in page_layout:
        kind = type( item ).__name__
        if kind == "LTFigure":
            found_figures.append( item )
            continue
        if kind not in silently_skipped:
            print( kind )
    return found_figures
def get_image_type( stream_first_4_bytes ):
    """Guess an image format from the leading bytes of a stream.

    Args:
        stream_first_4_bytes: Bytes taken from the start of the image data.

    Returns:
        'jpeg', 'png', 'gif' or 'bmp' when the hex form of the bytes matches a
        known signature; otherwise the hex string is printed and False is
        returned.
    """
    signature = b2a_hex( stream_first_4_bytes ).decode()
    # PNG and GIF are compared against the full hex string; JPEG and BMP are
    # recognised by their two-byte prefix.
    exact_signatures = { '89504e47': 'png' , '47494638': 'gif' }
    if signature.startswith( 'ffd8' ):
        return 'jpeg'
    if signature in exact_signatures:
        return exact_signatures[ signature ]
    if signature.startswith( '424d' ):
        return 'bmp'
    print( signature )
    return False
# def get_image_info_from_stream_raw_data( stream ):
# try:
# image = Image.open( io.BytesIO( element._objs[ 0 ].stream.rawdata ) )
# return {
# "type": get_image_type( element._objs[ 0 ].stream.rawdata[ 0 : 4 ] ) ,
# "width": element._objs[ 0 ].width ,
# "height": element._objs[ 0 ].height ,
# "bounding_box": {
# 'x1': element._objs[ 0 ].x0 , 'y1': element._objs[ 0 ].y0 ,
# 'x2': element._objs[ 0 ].x1 , 'y2': element._objs[ 0 ].y1 ,
# } ,
# "pil_image": image
# }
# except Exception as e:
# print( e )
# return False
def _image_geometry( layout_object ):
    """Return the width/height/bounding-box entries shared by every result dict."""
    return {
        "width": layout_object.width ,
        "height": layout_object.height ,
        "bounding_box": {
            'x1': layout_object.x0 , 'y1': layout_object.y0 ,
            'x2': layout_object.x1 , 'y2': layout_object.y1 ,
        } ,
    }


def _open_encoded_image( stream_owner , geometry_source ):
    """Open `stream_owner.stream.rawdata` with Pillow and describe the result."""
    raw_data = stream_owner.stream.rawdata
    image = Image.open( io.BytesIO( raw_data ) )
    return {
        "type": get_image_type( raw_data[ 0 : 4 ] ) ,
        **_image_geometry( geometry_source ) ,
        "pil_image": image ,
    }


def get_image_info( element ):
    """Build image information for the first child object of a figure.

    Strategy, in order:
      1. Let Pillow open the child's raw stream bytes directly.
      2. If the child has no `stream` attribute, open the raw stream bytes of
         the child's own first child instead, keeping the outer child's
         geometry.
      3. Otherwise decode the stream data as raw pixels, once per Pillow mode
         listed in `pillow_modes`.

    Args:
        element: Figure object exposing its children through `_objs`.

    Returns:
        A dict with keys "type", "width", "height", "bounding_box" and
        "pil_image" when step 1 or 2 succeeds. Otherwise a list of candidate
        dicts (same keys plus "mode", with "type" fixed to "png"), which is
        empty when no mode could be decoded.
    """
    try:
        first_object = element._objs[ 0 ]
        return _open_encoded_image( first_object , first_object )
    except Exception:
        # Best effort: any failure here just means the raw bytes are not a
        # self-contained image file, so fall through to the other strategies.
        pass

    # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
    pillow_modes = ( "RGB" , )
    candidate_images = []
    # https://github.com/pdfminer/pdfminer.six/issues/144
    # https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=frombytes#PIL.Image.frombytes
    for pillow_mode in pillow_modes:
        try:
            first_object = element._objs[ 0 ]
            # hasattr() checks attribute presence. The previous `"stream" not in obj`
            # was a membership test on the object's contents, not an attribute check.
            if not hasattr( first_object , "stream" ):
                return _open_encoded_image( first_object._objs[ 0 ] , first_object )
            if hasattr( first_object , "srcsize" ):
                source_size = first_object.srcsize
            else:
                source_size = ( first_object.width , first_object.height )
            figure_bytes = first_object.stream.get_data()
            image = Image.frombytes( pillow_mode , source_size , figure_bytes , 'raw' )
            candidate_images.append({
                "type": "png" ,
                "mode": pillow_mode ,
                **_image_geometry( first_object ) ,
                "pil_image": image ,
            })
        except Exception as e:
            # A mode that does not fit the data is reported and skipped.
            print( e )
    return candidate_images
if __name__ == "__main__":
    # Usage: <script> <input.pdf>
    # Images are written to a directory next to the PDF, named after the
    # PDF's stem. That directory is deleted and recreated on every run.
    if len( sys.argv ) < 2:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit( f"usage: {sys.argv[ 0 ]} <input.pdf>" )
    input_file_path_posix = Path( sys.argv[ 1 ] )
    output_base_path = input_file_path_posix.parent.joinpath( input_file_path_posix.stem )
    shutil.rmtree( output_base_path , ignore_errors=True )
    output_base_path.mkdir( parents=True , exist_ok=True )
    meta_data = get_meta_data( sys.argv[ 1 ] )
    page_layouts = get_page_layouts( sys.argv[ 1 ] , meta_data["total_pages"] )
    for page_layout_index , page_layout in enumerate( page_layouts ):
        figures = get_lt_figures( page_layout )
        for figure_index , figure in enumerate( figures ):
            image_info = get_image_info( figure )
            if image_info is not False:
                if isinstance( image_info , list ):
                    # A list holds raw-pixel candidates; save each one, tagged with its mode.
                    for candidate in image_info:
                        try:
                            candidate["image_save_path"] = output_base_path.joinpath( f"Page - {page_layout_index+1} - {figure_index+1} - {candidate['mode']}.{candidate['type']}" )
                            # Print only the candidate being saved, not the whole list each time.
                            pprint( candidate )
                            candidate["pil_image"].save( str( candidate["image_save_path"] ) )
                        except Exception as e:
                            print( e )
                else:
                    if image_info["type"] is not False:
                        # Same best-effort handling as the candidate branch:
                        # one image that cannot be saved must not abort the run.
                        try:
                            image_info["image_save_path"] = output_base_path.joinpath( f"Page - {page_layout_index+1} - {figure_index+1}.{image_info['type']}" )
                            pprint( image_info )
                            image_info["pil_image"].save( str( image_info["image_save_path"] ) )
                        except Exception as e:
                            print( e )
                    else:
                        # NOTE(review): indentation was lost in the source; this `else` is
                        # assumed to pair with the type check above. Confirm against the original.
                        pprint( image_info )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment