XBigTK13X/pdf-extract-images.py

## pdf-extract-images.py
#! /usr/bin/python3

# This script requires pdfimages (poppler-utils) and convert (imagemagick)

# Raw images will be written to <OUTPUT_DIR>/15-organized
# Attempts at merging masks and images will be written to <OUTPUT_DIR>/30-masked
# A sample of one image using all compose methods will be written to <OUTPUT_DIR>/25-samples

# Rewritten from https://gist.github.com/bendavis78/ed22a974c2b4534305eabb2522956359

import os
import sys
import subprocess
import shutil

# When True, log() prints nothing. Reassigned by the command-line handling.
QUIET = False

# Compose modes passed to `convert -compose` when merging an image with its mask.
COMPOSITIONS = ["CopyOpacity"]


def log(message):
    """Print ``message`` unless the module-level ``QUIET`` flag is set."""
    if QUIET:
        return
    print(message)

# Command line:
#   <input.pdf> <output_dir> [compositions|all] [sample_image_num] [quiet]
# Every argument is validated BEFORE the output directory is wiped, so a typo
# can no longer cost the user the results of a previous run.

# Any fifth argument (whatever its value) silences progress logging.
if len(sys.argv) >= 6:
    QUIET = True

if len(sys.argv) < 2:
    print("An input PDF file is required")
    sys.exit(1)

if len(sys.argv) < 3:
    print("An output directory is required")
    sys.exit(1)

INPUT_PDF_FILE = sys.argv[1]
OUTPUT_DIR = sys.argv[2]

if not os.path.isfile(INPUT_PDF_FILE):
    print(f"Input PDF file [{INPUT_PDF_FILE}] does not exist")
    sys.exit(1)

if len(sys.argv) < 4 or sys.argv[3] == "all":
    # NOTE(review): "all" keeps the default single-entry COMPOSITIONS list, so
    # it behaves exactly like omitting the argument.
    log("Will only attempt CopyOpacity composition")
else:
    log(f'Will attempt [{sys.argv[3]}] compositions')
    # Tolerate spaces and stray commas ("Over, CopyOpacity,") instead of
    # handing an empty mode name to convert much later.
    COMPOSITIONS = [
        name.strip() for name in sys.argv[3].split(',') if name.strip()
    ]
    if not COMPOSITIONS:
        print("At least one composition mode is required")
        sys.exit(1)

# Number of the image whose composed results are also copied to the samples
# directory.
SAMPLE_IMAGE_NUM = 1
if len(sys.argv) >= 5:
    try:
        SAMPLE_IMAGE_NUM = int(sys.argv[4])
    except ValueError:
        print(f"Sample image number must be an integer, got [{sys.argv[4]}]")
        sys.exit(1)
    log(f'Will copy samples using image [{sys.argv[4]}]')

# The output directory is recreated from scratch on every run. Refuse to do
# that when it contains the input PDF, which would be deleted along with it.
_output_root = os.path.join(os.path.realpath(OUTPUT_DIR), "")
if os.path.realpath(INPUT_PDF_FILE).startswith(_output_root):
    print(
        f"Input PDF file [{INPUT_PDF_FILE}] is inside the output directory "
        f"[{OUTPUT_DIR}], which is deleted on every run"
    )
    sys.exit(1)

if os.path.exists(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)

def execute(command):
    """Run an external command and return its exit status and captured output.

    Args:
        command: Either a shell command line (``str``), which is run through
            the shell, or a sequence of program arguments, which is run
            directly with no shell parsing of its elements.

    Returns:
        A dict with ``"result"`` (the exit code, which is 0 whenever this
        function returns) and ``"stdout"`` / ``"stderr"`` (the captured
        streams decoded as UTF-8 and split on ``"\\n"``).

    Raises:
        SystemExit: With status 1, after printing the command and its output,
            when the command exits non-zero or cannot be started.
    """
    try:
        completed = subprocess.run(
            command,
            shell=isinstance(command, str),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except OSError as error:
        # Only reachable for argument lists: the shell reports a missing
        # program through its exit status instead of raising.
        print("An error occurred while running {}".format(command))
        print("stderr: {}".format(error))
        sys.exit(1)

    # errors='replace': tool output is not guaranteed to be valid UTF-8 and a
    # stray byte must not abort the whole run with UnicodeDecodeError.
    stdout = completed.stdout.decode('utf-8', errors='replace')
    stderr = completed.stderr.decode('utf-8', errors='replace')

    if completed.returncode != 0:
        print("An error occurred while running {}".format(command))
        print("stdout: {}".format(stdout))
        print("stderr: {}".format(stderr))
        sys.exit(1)

    return {
        "result": completed.returncode,
        "stdout": stdout.split('\n'),
        "stderr": stderr.split('\n'),
    }

# Stage 1: dump the images stored in the PDF into <OUTPUT_DIR>/10-extract.
EXTRACT_DIR = os.path.join(OUTPUT_DIR, "10-extract")
if not os.path.exists(EXTRACT_DIR):
    os.makedirs(EXTRACT_DIR, exist_ok=True)

log(f"Extract image data from PDF to [{EXTRACT_DIR}]")
# pdfimages is handed "<EXTRACT_DIR>/image" as its output prefix.
command = f'pdfimages -png "{INPUT_PDF_FILE}" "{EXTRACT_DIR}/image"'
execute(command)

log("Gather extracted image paths")
# Map image number -> file path. The number is the second '-'-separated piece
# of the file name, cut at its first '.'.
extracted_images = {
    int(file_name.split('-')[1].split('.')[0]): os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk(EXTRACT_DIR)
    for file_name in file_names
}

# Attribute names given, in order, to the whitespace-separated fields of one
# `pdfimages -list` row.
metadata_parts = (
    'page num type width height color comp bpc '
    'enc interop object id x_ppi y_ppi size ratio'
).split()


class PdfImageMetadata:
    """One row of `pdfimages -list` output, exposed as attributes.

    The whitespace-separated fields of ``text`` are stored, in order, under
    the names in ``metadata_parts``. Surplus fields are ignored, and names
    with no matching field are left unset. ``num`` and ``object`` are
    converted to ``int``; every other attribute stays a string.
    """

    def __init__(self, text):
        for name, value in zip(metadata_parts, text.split()):
            setattr(self, name, value)
        self.num = int(self.num)
        self.object = int(self.object)

log("Parse PDF image metadata")
command = f'pdfimages -list "{INPUT_PDF_FILE}"'
list_results = execute(command)

# Rows grouped by PDF object number: {object: {type: PdfImageMetadata}}.
pdf_objects = {}
# The first two output lines are skipped (presumably the column header and
# its rule), as is any line of two characters or fewer.
for line in list_results['stdout'][2:]:
    if len(line) <= 2:
        continue
    image = PdfImageMetadata(line)
    # Only rows whose type mentions "image" or "smask" are kept.
    if 'image' in image.type or 'smask' in image.type:
        pdf_objects.setdefault(image.object, {})[image.type] = image

# Output layout underneath OUTPUT_DIR.
MASKED_DIR = os.path.join(OUTPUT_DIR, '30-masked')
SAMPLE_DIR = os.path.join(OUTPUT_DIR, '25-samples')
ORGANIZE_DIR = os.path.join(OUTPUT_DIR, '15-organized')
RAW_MASK_DIR = os.path.join(ORGANIZE_DIR, 'mask')
RAW_IMAGE_DIR = os.path.join(ORGANIZE_DIR, 'image')

for directory in (MASKED_DIR, SAMPLE_DIR, ORGANIZE_DIR, RAW_MASK_DIR):
    os.makedirs(directory, exist_ok=True)
# Unlike the others, this directory is created without exist_ok.
os.makedirs(RAW_IMAGE_DIR)

def compose(image, mask, destination, mode, prefix):
    """Merge ``image`` with ``mask`` using ImageMagick's ``convert``.

    The result is written to ``MASKED_DIR/<prefix>/<mode>/<NNNNN>.png``,
    where ``NNNNN`` is ``destination`` zero-padded to five digits. When
    ``destination`` equals ``SAMPLE_IMAGE_NUM`` the result is also copied to
    ``SAMPLE_DIR`` as ``<prefix>-<mode>-<NNNNN>.png``.

    Args:
        image: Path of the image file.
        mask: Path of the mask file.
        destination: Integer used to name the output file.
        mode: Value passed to ``convert -compose``.
        prefix: Sub-directory name and sample file-name prefix.
    """
    target_dir = os.path.join(MASKED_DIR, prefix, mode)
    os.makedirs(target_dir, exist_ok=True)

    file_name = f'{destination:05d}.png'
    target_path = os.path.join(target_dir, file_name)
    execute(f'convert "{image}" "{mask}" -compose {mode} -composite "{target_path}"')

    if destination != SAMPLE_IMAGE_NUM:
        return
    shutil.copy(target_path, os.path.join(SAMPLE_DIR, f'{prefix}-{mode}-{file_name}'))

log("Merging masked images, copying standalone images")

# Pass 1: organize the raw files. This does not depend on the composition
# mode, so it is done once rather than once per mode.
masked_pairs = []  # (image path, mask path, image number) for each masked image
standalone_count = 0
for entry in pdf_objects.values():
    if 'image' not in entry:
        # A mask row with no image row under the same object: nothing to copy.
        continue
    image_num = entry['image'].num
    image = extracted_images[image_num]
    shutil.copy(image, os.path.join(RAW_IMAGE_DIR, f"{image_num}.png"))
    if 'smask' in entry:
        mask_num = entry['smask'].num
        mask = extracted_images[mask_num]
        shutil.copy(mask, os.path.join(RAW_MASK_DIR, f"{mask_num}.png"))
        masked_pairs.append((image, mask, image_num))
    else:
        standalone_count += 1
merged_count = len(masked_pairs)

# Pass 2: merge every image/mask pair once per requested composition mode.
for mode_count, mode in enumerate(COMPOSITIONS, start=1):
    log(f"Compose images using mode ({mode_count}/{len(COMPOSITIONS)}) [{mode}]")
    for image, mask, image_num in masked_pairs:
        compose(image, mask, image_num, mode, "image+mask")

log(f"Raw images sorted in [{ORGANIZE_DIR}]")
log(f"{standalone_count} standalone images copied to [{RAW_IMAGE_DIR}]")
log(f"{merged_count} masked images merged in [{len(COMPOSITIONS)}] ways to [{MASKED_DIR}]")
	#! /usr/bin/python3

	# This script requires pdfimages (poppler-utils) and convert (imagemagick)

	# Raw images will be written to <OUTPUT_DIR>/15-organized
	# Attempts at merging masks and images will be written to <OUTPUT_DIR>/30-masked
	# A sample of one image using all compose methods will be written to <OUTPUT_DIR>/25-samples

	# Rewritten from https://gist.github.com/bendavis78/ed22a974c2b4534305eabb2522956359

	import os
	import sys
	import subprocess
	import shutil

	# When True, log() prints nothing. Reassigned by the command-line handling.
	QUIET = False

	# Compose modes passed to `convert -compose` when merging an image with its mask.
	COMPOSITIONS = ["CopyOpacity"]


	def log(message):
	    """Print ``message`` unless the module-level ``QUIET`` flag is set."""
	    # Body indentation restored; the flattened original was not valid Python.
	    if QUIET:
	        return
	    print(message)

	# Command line:
	#   <input.pdf> <output_dir> [compositions|all] [sample_image_num] [quiet]
	# Block structure restored (the flattened original was not valid Python).
	# Every argument is validated BEFORE the output directory is wiped, so a
	# typo can no longer cost the user the results of a previous run.

	# Any fifth argument (whatever its value) silences progress logging.
	if len(sys.argv) >= 6:
	    QUIET = True

	if len(sys.argv) < 2:
	    print("An input PDF file is required")
	    sys.exit(1)

	if len(sys.argv) < 3:
	    print("An output directory is required")
	    sys.exit(1)

	INPUT_PDF_FILE = sys.argv[1]
	OUTPUT_DIR = sys.argv[2]

	if not os.path.isfile(INPUT_PDF_FILE):
	    print(f"Input PDF file [{INPUT_PDF_FILE}] does not exist")
	    sys.exit(1)

	if len(sys.argv) < 4 or sys.argv[3] == "all":
	    # NOTE(review): "all" keeps the default single-entry COMPOSITIONS list,
	    # so it behaves exactly like omitting the argument.
	    log("Will only attempt CopyOpacity composition")
	else:
	    log(f'Will attempt [{sys.argv[3]}] compositions')
	    # Tolerate spaces and stray commas ("Over, CopyOpacity,") instead of
	    # handing an empty mode name to convert much later.
	    COMPOSITIONS = [
	        name.strip() for name in sys.argv[3].split(',') if name.strip()
	    ]
	    if not COMPOSITIONS:
	        print("At least one composition mode is required")
	        sys.exit(1)

	# Number of the image whose composed results are also copied to the
	# samples directory.
	SAMPLE_IMAGE_NUM = 1
	if len(sys.argv) >= 5:
	    try:
	        SAMPLE_IMAGE_NUM = int(sys.argv[4])
	    except ValueError:
	        print(f"Sample image number must be an integer, got [{sys.argv[4]}]")
	        sys.exit(1)
	    log(f'Will copy samples using image [{sys.argv[4]}]')

	# The output directory is recreated from scratch on every run. Refuse to do
	# that when it contains the input PDF, which would be deleted along with it.
	_output_root = os.path.join(os.path.realpath(OUTPUT_DIR), "")
	if os.path.realpath(INPUT_PDF_FILE).startswith(_output_root):
	    print(
	        f"Input PDF file [{INPUT_PDF_FILE}] is inside the output directory "
	        f"[{OUTPUT_DIR}], which is deleted on every run"
	    )
	    sys.exit(1)

	if os.path.exists(OUTPUT_DIR):
	    shutil.rmtree(OUTPUT_DIR)

	def execute(command):
	    """Run an external command and return its exit status and captured output.

	    Args:
	        command: Either a shell command line (``str``), which is run
	            through the shell, or a sequence of program arguments, which
	            is run directly with no shell parsing of its elements.

	    Returns:
	        A dict with ``"result"`` (the exit code, which is 0 whenever this
	        function returns) and ``"stdout"`` / ``"stderr"`` (the captured
	        streams decoded as UTF-8 and split on ``"\\n"``).

	    Raises:
	        SystemExit: With status 1, after printing the command and its
	            output, when the command exits non-zero or cannot be started.
	    """
	    # Body indentation restored; the flattened original was not valid Python.
	    try:
	        completed = subprocess.run(
	            command,
	            shell=isinstance(command, str),
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	        )
	    except OSError as error:
	        # Only reachable for argument lists: the shell reports a missing
	        # program through its exit status instead of raising.
	        print("An error occurred while running {}".format(command))
	        print("stderr: {}".format(error))
	        sys.exit(1)

	    # errors='replace': tool output is not guaranteed to be valid UTF-8 and
	    # a stray byte must not abort the whole run with UnicodeDecodeError.
	    stdout = completed.stdout.decode('utf-8', errors='replace')
	    stderr = completed.stderr.decode('utf-8', errors='replace')

	    if completed.returncode != 0:
	        print("An error occurred while running {}".format(command))
	        print("stdout: {}".format(stdout))
	        print("stderr: {}".format(stderr))
	        sys.exit(1)

	    return {
	        "result": completed.returncode,
	        "stdout": stdout.split('\n'),
	        "stderr": stderr.split('\n'),
	    }

	# Stage 1: dump the images stored in the PDF into <OUTPUT_DIR>/10-extract.
	# Block structure restored (the flattened original was not valid Python).
	EXTRACT_DIR = os.path.join(OUTPUT_DIR, "10-extract")
	if not os.path.exists(EXTRACT_DIR):
	    os.makedirs(EXTRACT_DIR, exist_ok=True)

	log(f"Extract image data from PDF to [{EXTRACT_DIR}]")
	# pdfimages is handed "<EXTRACT_DIR>/image" as its output prefix.
	command = f'pdfimages -png "{INPUT_PDF_FILE}" "{EXTRACT_DIR}/image"'
	execute(command)

	log("Gather extracted image paths")
	# Map image number -> file path. The number is the second '-'-separated
	# piece of the file name, cut at its first '.'.
	extracted_images = {}
	for root, dirs, files in os.walk(EXTRACT_DIR):
	    for ff in files:
	        image_num = int(ff.split('-')[1].split('.')[0])
	        extracted_images[image_num] = os.path.join(root, ff)

	# Attribute names given, in order, to the whitespace-separated fields of
	# one `pdfimages -list` row.
	metadata_parts = [
	    'page',
	    'num',
	    'type',
	    'width',
	    'height',
	    'color',
	    'comp',
	    'bpc',
	    'enc',
	    'interop',
	    'object',
	    'id',
	    'x_ppi',
	    'y_ppi',
	    'size',
	    'ratio',
	]


	class PdfImageMetadata:
	    """One row of `pdfimages -list` output, exposed as attributes.

	    The whitespace-separated fields of ``text`` are stored, in order, under
	    the names in ``metadata_parts``. Surplus fields are ignored, and names
	    with no matching field are left unset. ``num`` and ``object`` are
	    converted to ``int``; every other attribute stays a string.
	    """

	    # Class and method indentation restored; the flattened original was not
	    # valid Python.
	    def __init__(self, text):
	        for name, value in zip(metadata_parts, text.split()):
	            setattr(self, name, value)
	        self.num = int(self.num)
	        self.object = int(self.object)

	# Block structure restored (the flattened original was not valid Python).
	log("Parse PDF image metadata")
	command = f'pdfimages -list "{INPUT_PDF_FILE}"'
	list_results = execute(command)

	# Rows grouped by PDF object number: {object: {type: PdfImageMetadata}}.
	pdf_objects = {}
	# The first two output lines are skipped (presumably the column header and
	# its rule), as is any line of two characters or fewer.
	for line in list_results['stdout'][2:]:
	    if len(line) <= 2:
	        continue
	    image = PdfImageMetadata(line)
	    # Only rows whose type mentions "image" or "smask" are kept.
	    if 'image' in image.type or 'smask' in image.type:
	        pdf_objects.setdefault(image.object, {})[image.type] = image

	# Output layout underneath OUTPUT_DIR.
	MASKED_DIR = os.path.join(OUTPUT_DIR, '30-masked')
	os.makedirs(MASKED_DIR, exist_ok=True)

	SAMPLE_DIR = os.path.join(OUTPUT_DIR, '25-samples')
	os.makedirs(SAMPLE_DIR, exist_ok=True)

	ORGANIZE_DIR = os.path.join(OUTPUT_DIR, '15-organized')
	os.makedirs(ORGANIZE_DIR, exist_ok=True)

	RAW_MASK_DIR = os.path.join(ORGANIZE_DIR, 'mask')
	os.makedirs(RAW_MASK_DIR, exist_ok=True)

	# Unlike the others, this directory is created without exist_ok.
	RAW_IMAGE_DIR = os.path.join(ORGANIZE_DIR, 'image')
	os.makedirs(RAW_IMAGE_DIR)

	def compose(image, mask, destination, mode, prefix):
	    """Merge ``image`` with ``mask`` using ImageMagick's ``convert``.

	    The result is written to ``MASKED_DIR/<prefix>/<mode>/<NNNNN>.png``,
	    where ``NNNNN`` is ``destination`` zero-padded to five digits. When
	    ``destination`` equals ``SAMPLE_IMAGE_NUM`` the result is also copied
	    to ``SAMPLE_DIR`` as ``<prefix>-<mode>-<NNNNN>.png``.

	    Args:
	        image: Path of the image file.
	        mask: Path of the mask file.
	        destination: Integer used to name the output file.
	        mode: Value passed to ``convert -compose``.
	        prefix: Sub-directory name and sample file-name prefix.
	    """
	    # Body indentation restored; the flattened original was not valid Python.
	    merged_dir = os.path.join(MASKED_DIR, prefix, mode)
	    os.makedirs(merged_dir, exist_ok=True)
	    merged_file = f'{destination:05d}.png'
	    merged_path = os.path.join(merged_dir, merged_file)
	    command = f'convert "{image}" "{mask}" -compose {mode} -composite "{merged_path}"'
	    execute(command)
	    if destination == SAMPLE_IMAGE_NUM:
	        sample_path = os.path.join(SAMPLE_DIR, f'{prefix}-{mode}-{destination:05d}.png')
	        shutil.copy(merged_path, sample_path)

	# Block structure restored (the flattened original was not valid Python).
	log("Merging masked images, copying standalone images")

	# Pass 1: organize the raw files. This does not depend on the composition
	# mode, so it is done once rather than once per mode.
	masked_pairs = []  # (image path, mask path, image number) per masked image
	standalone_count = 0
	for entry in pdf_objects.values():
	    if 'image' not in entry:
	        # A mask row with no image row under the same object: nothing to copy.
	        continue
	    image_num = entry['image'].num
	    image = extracted_images[image_num]
	    shutil.copy(image, os.path.join(RAW_IMAGE_DIR, f"{image_num}.png"))
	    if 'smask' in entry:
	        mask_num = entry['smask'].num
	        mask = extracted_images[mask_num]
	        shutil.copy(mask, os.path.join(RAW_MASK_DIR, f"{mask_num}.png"))
	        masked_pairs.append((image, mask, image_num))
	    else:
	        standalone_count += 1
	merged_count = len(masked_pairs)

	# Pass 2: merge every image/mask pair once per requested composition mode.
	for mode_count, mode in enumerate(COMPOSITIONS, start=1):
	    log(f"Compose images using mode ({mode_count}/{len(COMPOSITIONS)}) [{mode}]")
	    for image, mask, image_num in masked_pairs:
	        compose(image, mask, image_num, mode, "image+mask")

	log(f"Raw images sorted in [{ORGANIZE_DIR}]")
	log(f"{standalone_count} standalone images copied to [{RAW_IMAGE_DIR}]")
	log(f"{merged_count} masked images merged in [{len(COMPOSITIONS)}] ways to [{MASKED_DIR}]")