# Wolfenswan/image_processor.py

## image_processor.py
"""
Bulk image processor + pdf creator

I had to digitize, sort and process a large amount of archival material page by page. Unfortunately, all pages only existed as single .jpgs.
I wrote this script to automate the tedious process of renaming + resizing the images, as well as putting them in a single pdf.

It will:
- process all folders in its current directory
- create a backup of the original images
- rename all images according to the name of the directory containing them
- resize all images (atm. to dpi150 standards), maintaining ratio
- create a pdf containing all images

It is currently very much tailored to a specific job but, should the need arise, it can be made more dynamic and possibly cater to user input to some extent.

"""

import os
import shutil
from reportlab.pdfgen.canvas import Canvas
from PIL import Image, ExifTags

BACKUP_DIR_NAME = '_originale'
JPG_COMPRESSION = 80

def process_folders(root_dir, backup_dir):
	"""Walk root_dir top-down and pass every folder below it to process_files.

	Folders whose name starts with BACKUP_DIR_NAME are removed from the walk, so
	the backup copies are never processed themselves. root_dir itself is skipped.

	Args:
		root_dir: directory whose sub-folders should be processed.
		backup_dir: accepted for the caller's convenience; not used here.
	"""
	for current_dir, sub_dirs, file_names in os.walk(root_dir, topdown=True):
		# Build the list of folders to descend into, leaving out the backups
		kept = []
		for name in sub_dirs:
			if name.startswith(BACKUP_DIR_NAME):
				continue
			kept.append(name)
		# Slice assignment edits the list os.walk holds, which is what prunes the walk
		sub_dirs[:] = kept
		if current_dir == root_dir:
			continue
		process_files(current_dir, file_names)

def process_files(dir, fileList, root_dir=None):
	"""Back up, rename and rescale the images of one folder and bundle them into a pdf.

	A folder that already contains a pdf is treated as done and skipped.
	Otherwise the folder is copied to <root_dir>/<BACKUP_DIR_NAME>/<folder name>
	(unless that backup already exists). Every .jpg/.jpeg/.png file is then renamed
	to <folder name>_<index><extension>, shrunk in place by scale_image and added
	as one page to <folder name>.pdf inside the folder.

	Args:
		dir: path of the folder to process.
		fileList: names of the files inside dir (as yielded by os.walk).
		root_dir: directory that holds the backup folder. When omitted, the
			module-level root_dir set by the script entry point is used, falling
			back to the current working directory.
	"""
	import re  # local import: only the natural sort below needs it

	def _natural_key(name):
		# Split into text/number runs so that "_2" sorts before "_10"
		parts = re.split(r'(\d+)', name)
		return [int(part) if idx % 2 else part.lower() for idx, part in enumerate(parts)]

	image_types = ('.jpg', '.jpeg', '.png')

	if root_dir is None:
		# The global only exists when the file is run as a script; without the
		# fallback an importing caller would hit a NameError.
		root_dir = globals().get('root_dir') or os.getcwd()

	dir_name = os.path.basename(dir)
	backup_dir = os.path.join(root_dir, BACKUP_DIR_NAME, dir_name)
	print(f'Accessing {dir_name}, containing {len(fileList)} files.')

	# An existing pdf (any spelling of the extension) marks the folder as done
	if any(os.path.splitext(name)[1].lower() == '.pdf' for name in fileList):
		print(f'PDF found, ignoring {dir_name}...')
		return

	# Create a backup if necessary
	if not os.path.isdir(backup_dir):
		print(f'Creating backup of {dir_name}')
		shutil.copytree(dir, backup_dir)

	# Create basic pdf file
	pdf_path = os.path.join(dir, f'{dir_name}.pdf')
	pdf = Canvas(pdf_path)

	# os.walk yields names in arbitrary order; sort so page order is deterministic
	for i, fname in enumerate(sorted(fileList, key=_natural_key)):
		file_type = os.path.splitext(fname)[1]
		if file_type.lower() not in image_types:
			continue
		file_path = os.path.join(dir, fname)
		new_name = f'{dir_name}_{i}{file_type}'
		new_file = os.path.join(dir, new_name)
		if os.path.isfile(new_file):
			# Never overwrite an existing file; the page is left out of the pdf
			print('ERROR: file exists')
			continue
		print(f'Renaming {fname} to {new_name}')
		os.rename(file_path, new_file)
		image = scale_image(new_file)
		width, height = image.size
		# One pdf page per image, sized exactly to the scaled image
		pdf.setPageSize((width, height))
		pdf.drawImage(new_file, 0, 0, width, height, preserveAspectRatio=True)
		pdf.showPage()
	print('Writing pdf (might take a while...)')
	pdf.save()

def scale_image(image_path):
	"""Rotate (as requested by EXIF), shrink and re-save the image at image_path.

	The file is overwritten in place. The image is fitted into a 1754x1240
	(landscape) or 1240x1754 (portrait) box, keeping its aspect ratio; images
	already smaller than the box are not enlarged.

	Args:
		image_path: path of the image file to process.

	Returns:
		The processed PIL image; its .size is the final pixel size.
	"""
	# EXIF orientation value -> counter-clockwise angle that turns the image upright.
	# Mirrored orientations (2, 4, 5, 7) are not handled.
	rotations = {3: 180, 6: 270, 8: 90}
	orientation_tag = 0x0112

	image = Image.open(image_path)
	# getexif() yields an empty mapping for files without EXIF data, so images
	# lacking metadata no longer crash here.
	orientation = image.getexif().get(orientation_tag)
	angle = rotations.get(orientation)
	if angle is not None:
		image = image.rotate(angle, expand=True)

	# Pick the box after rotating, so it matches the orientation that gets saved
	w, h = image.size
	size = (1754,1240) if w > h else (1240,1754) #Dpi150
	#size = (842,595) if w > h else (595,842) # Dpi72
	# LANCZOS is the same filter as the ANTIALIAS alias removed in Pillow 10
	image.thumbnail(size, Image.LANCZOS)
	image.save(image_path, optimize=True, quality=JPG_COMPRESSION)
	return image

if __name__ == '__main__':
	# Work on the directory the script was started from
	root_dir = os.getcwd()
	backup_dir = os.path.join(root_dir, BACKUP_DIR_NAME)
	# Make sure the top-level backup folder exists before any folder is processed
	os.makedirs(backup_dir, exist_ok=True)
	process_folders(root_dir, backup_dir)
	"""
	Bulk image processor + pdf creator

	I had to digitize, sort and process a large amount of archival material page by page. Unfortunately, all pages only existed as single .jpgs.
	I wrote this script to automate the tedious process of renaming + resizing the images, as well as putting them in a single pdf.

	It will:
	- process all folders in its current directory
	- create a backup of the original images
	- rename all images according to the name of the directory containing them
	- resize all images (atm. to dpi150 standards), maintaining ratio
	- create a pdf containing all images

	It is currently very much tailored to a specific job but, should the need arise, it can be made more dynamic and possibly cater to user input to some extent.

	"""

	import os
	import shutil
	from reportlab.pdfgen.canvas import Canvas
	from PIL import Image, ExifTags

	BACKUP_DIR_NAME = '_originale'
	JPG_COMPRESSION = 80

	# NOTE(review): second copy of process_folders, pasted after the script entry
	# point with its indentation flattened. The structure is restored here so the
	# file parses; consider removing the duplicate.
	def process_folders(root_dir, backup_dir):
		"""Walk root_dir top-down and pass every folder below it to process_files.

		Folders whose name starts with BACKUP_DIR_NAME are removed from the walk, so
		the backup copies are never processed themselves. root_dir itself is skipped.

		Args:
			root_dir: directory whose sub-folders should be processed.
			backup_dir: accepted for the caller's convenience; not used here.
		"""
		for current_dir, sub_dirs, file_names in os.walk(root_dir, topdown=True):
			# Slice assignment edits the list os.walk holds, which is what prunes the walk
			sub_dirs[:] = [name for name in sub_dirs if not name.startswith(BACKUP_DIR_NAME)]
			if current_dir != root_dir:
				process_files(current_dir, file_names)

	# NOTE(review): second copy of process_files, pasted after the script entry
	# point with its indentation flattened. The structure is restored here so the
	# file parses; consider removing the duplicate.
	def process_files(dir, fileList, root_dir=None):
		"""Back up, rename and rescale the images of one folder and bundle them into a pdf.

		A folder that already contains a pdf is treated as done and skipped.
		Otherwise the folder is copied to <root_dir>/<BACKUP_DIR_NAME>/<folder name>
		(unless that backup already exists). Every .jpg/.jpeg/.png file is then
		renamed to <folder name>_<index><extension>, shrunk in place by scale_image
		and added as one page to <folder name>.pdf inside the folder.

		Args:
			dir: path of the folder to process.
			fileList: names of the files inside dir (as yielded by os.walk).
			root_dir: directory that holds the backup folder. When omitted, the
				module-level root_dir set by the script entry point is used,
				falling back to the current working directory.
		"""
		import re  # local import: only the natural sort below needs it

		def _natural_key(name):
			# Split into text/number runs so that "_2" sorts before "_10"
			parts = re.split(r'(\d+)', name)
			return [int(part) if idx % 2 else part.lower() for idx, part in enumerate(parts)]

		image_types = ('.jpg', '.jpeg', '.png')

		if root_dir is None:
			# The global only exists when the file is run as a script; without the
			# fallback an importing caller would hit a NameError.
			root_dir = globals().get('root_dir') or os.getcwd()

		dir_name = os.path.basename(dir)
		backup_dir = os.path.join(root_dir, BACKUP_DIR_NAME, dir_name)
		print(f'Accessing {dir_name}, containing {len(fileList)} files.')

		# An existing pdf (any spelling of the extension) marks the folder as done
		if any(os.path.splitext(name)[1].lower() == '.pdf' for name in fileList):
			print(f'PDF found, ignoring {dir_name}...')
			return

		# Create a backup if necessary
		if not os.path.isdir(backup_dir):
			print(f'Creating backup of {dir_name}')
			shutil.copytree(dir, backup_dir)

		# Create basic pdf file
		pdf_path = os.path.join(dir, f'{dir_name}.pdf')
		pdf = Canvas(pdf_path)

		# os.walk yields names in arbitrary order; sort so page order is deterministic
		for i, fname in enumerate(sorted(fileList, key=_natural_key)):
			file_type = os.path.splitext(fname)[1]
			if file_type.lower() not in image_types:
				continue
			file_path = os.path.join(dir, fname)
			new_name = f'{dir_name}_{i}{file_type}'
			new_file = os.path.join(dir, new_name)
			if os.path.isfile(new_file):
				# Never overwrite an existing file; the page is left out of the pdf
				print('ERROR: file exists')
				continue
			print(f'Renaming {fname} to {new_name}')
			os.rename(file_path, new_file)
			image = scale_image(new_file)
			width, height = image.size
			# One pdf page per image, sized exactly to the scaled image
			pdf.setPageSize((width, height))
			pdf.drawImage(new_file, 0, 0, width, height, preserveAspectRatio=True)
			pdf.showPage()
		print('Writing pdf (might take a while...)')
		pdf.save()

	# NOTE(review): second copy of scale_image, pasted after the script entry
	# point with its indentation flattened. The structure is restored here so the
	# file parses; consider removing the duplicate.
	def scale_image(image_path):
		"""Rotate (as requested by EXIF), shrink and re-save the image at image_path.

		The file is overwritten in place. The image is fitted into a 1754x1240
		(landscape) or 1240x1754 (portrait) box, keeping its aspect ratio; images
		already smaller than the box are not enlarged.

		Args:
			image_path: path of the image file to process.

		Returns:
			The processed PIL image; its .size is the final pixel size.
		"""
		# EXIF orientation value -> counter-clockwise angle that turns the image upright.
		# Mirrored orientations (2, 4, 5, 7) are not handled.
		rotations = {3: 180, 6: 270, 8: 90}
		orientation_tag = 0x0112

		image = Image.open(image_path)
		# getexif() yields an empty mapping for files without EXIF data, so images
		# lacking metadata no longer crash here.
		orientation = image.getexif().get(orientation_tag)
		angle = rotations.get(orientation)
		if angle is not None:
			image = image.rotate(angle, expand=True)

		# Pick the box after rotating, so it matches the orientation that gets saved
		w, h = image.size
		size = (1754,1240) if w > h else (1240,1754) #Dpi150
		#size = (842,595) if w > h else (595,842) # Dpi72
		# LANCZOS is the same filter as the ANTIALIAS alias removed in Pillow 10
		image.thumbnail(size, Image.LANCZOS)
		image.save(image_path, optimize=True, quality=JPG_COMPRESSION)
		return image

	# NOTE(review): second copy of the script entry point, nested inside the first
	# one with its indentation flattened. The structure is restored here so the
	# file parses, but as written it runs the whole job a second time; consider
	# removing the duplicate.
	if __name__ == '__main__':
		root_dir = os.getcwd()
		backup_dir = os.path.join(root_dir, BACKUP_DIR_NAME)
		if not os.path.isdir(backup_dir): # create backup main directory as required
			os.mkdir(backup_dir)
		process_folders(root_dir, backup_dir)