JoelGeraci/PPDF_Extract_Preprocessors.java

## config.json
{
	"maxFileSize": {
		"maxFileSizeMB": 100,
		"messages": {
			"pass": "File size is acceptable.",
			"fail": "The file is is over 100MB. Too large for Extract API."
		}
	},
	"maxNumPages": {
		"maxNumPages": 400,
		"messages": {
			"pass": "The number of pages in the file is acceptable.",
			"fail": "The file is over 400 pages. Too long for Extract API."
		}
	},
	"permissions": {
		"messages": {
			"pass": "Content Extraction is allowed.",
			"fail": "Content Extraction NOT is allowed."
		}
	},
	"toUnicode": {
		"messages": {
			"pass": "No encoding issues for fonts with Identity-H encoding.",
			"fail": "The file contains fonts with Identity-H encoding that are missing ToUnicode CMaps. Extract API output may not contain usable text."
		}
	}
}

## PPDF_Extract_Preprocessors.java
/* Adobe Extract API Preprocessors by practicalPDF Inc. © 2024 by Joel Geraci is licensed under CC BY-SA 4.0  */

package com.practicalpdf.ExtractPreProcessors;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Collection;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;

/**
 * Command-line pre-flight checks for a PDF file.
 *
 * <p>Reads limits and user-facing messages from {@code config.json} in the working
 * directory, then runs four checks in order: file size, page count, content-extraction
 * permissions, and Identity-H fonts lacking a ToUnicode CMap. Each check prints its
 * configured pass or fail message; on the first failure the program prints
 * {@code Exiting} and terminates with status 0.
 */
public class PPDF_Extract_Preprocessors {
	/** Name of the font encoding that the ToUnicode check looks for. */
	private static final String IDENTITY_H = "Identity-H";

	/**
	 * Entry point.
	 *
	 * @param args {@code args[0]} is the path of the PDF file to check
	 */
	public static void main(String[] args) {
		if (args == null || args.length == 0) {
			System.err.println("Usage: PPDF_Extract_Preprocessors <path-to-pdf>");
			System.exit(1);
		}
		String fileName = args[0];

		try (FileReader configReader = new FileReader("config.json")) {
			JSONObject config = (JSONObject) new JSONParser().parse(configReader);

			// Cheap check first: an oversized file is rejected before PDFBox parses it.
			boolean isFileSizeAcceptable = isFileSizeAcceptable(fileName, config);

			// try-with-resources releases the document on the normal path; a failed
			// check ends the process through System.exit instead.
			try (PDDocument pdfDocument = Loader.loadPDF(new RandomAccessReadBufferedFile(fileName))) {
				boolean isNumPagesAcceptable = isNumPagesAcceptable(pdfDocument, config);
				boolean isTextExtractionAllowed = isTextExtractionAllowed(pdfDocument, config);
				boolean hasToUnicodeProblem = hasToUnicodeProblem(pdfDocument, config);

				System.out.println("isFileSizeAcceptable: " + isFileSizeAcceptable);
				System.out.println("isNumPagesAcceptable: " + isNumPagesAcceptable);
				System.out.println("isTextExtractionAllowed: " + isTextExtractionAllowed);
				System.out.println("hasToUnicodeProblem: " + hasToUnicodeProblem);
			}
		} catch (IOException e) {
			// Keep the original stdout marker, but do not hide the cause.
			System.out.println("Error");
			System.err.println("I/O failure while processing " + fileName + ": " + e);
		} catch (ParseException e) {
			System.err.println("Could not parse config.json: " + e);
		}
	}

	/**
	 * Checks the file size, rounded up to whole megabytes, against
	 * {@code maxFileSize.maxFileSizeMB}.
	 *
	 * @param fileName path of the file to measure
	 * @param config   parsed {@code config.json}
	 * @return {@code true} when the size is within the limit (the process exits otherwise)
	 */
	private static boolean isFileSizeAcceptable(String fileName, JSONObject config) {
		JSONObject section = requireSection(config, "maxFileSize");
		long maxFileSizeMB = requireLimit(section, "maxFileSizeMB");
		long fileSizeMB = (long) Math.ceil(getFileSizeMegaBytes(new File(fileName)));
		return report(section, fileSizeMB <= maxFileSizeMB);
	}

	/**
	 * Checks the document's page count against {@code maxNumPages.maxNumPages}.
	 *
	 * @param pdfDocument loaded document
	 * @param config      parsed {@code config.json}
	 * @return {@code true} when the page count is within the limit (the process exits otherwise)
	 */
	private static boolean isNumPagesAcceptable(PDDocument pdfDocument, JSONObject config) {
		JSONObject section = requireSection(config, "maxNumPages");
		long maxNumPages = requireLimit(section, "maxNumPages");
		return report(section, pdfDocument.getNumberOfPages() <= maxNumPages);
	}

	/**
	 * Checks that the current access permissions allow both content extraction and
	 * extraction for accessibility.
	 *
	 * @param pdfDocument loaded document
	 * @param config      parsed {@code config.json}
	 * @return {@code true} when both permissions are granted (the process exits otherwise)
	 */
	private static boolean isTextExtractionAllowed(PDDocument pdfDocument, JSONObject config) {
		boolean allowed = pdfDocument.getCurrentAccessPermission().canExtractContent()
				&& pdfDocument.getCurrentAccessPermission().canExtractForAccessibility();
		return report(requireSection(config, "permissions"), allowed);
	}

	/**
	 * Scans every page for an Identity-H font that has no ToUnicode stream.
	 *
	 * @param pdfDocument loaded document
	 * @param config      parsed {@code config.json}
	 * @return {@code false} when no such font was found (the process exits otherwise)
	 */
	private static boolean hasToUnicodeProblem(PDDocument pdfDocument, JSONObject config) {
		boolean problemFound = false;
		for (int i = 0; i < pdfDocument.getNumberOfPages(); ++i) {
			if (pageHasToUnicodeProblem(pdfDocument.getPage(i).getResources())) {
				// Stop at the first hit so a later clean page cannot reset the result.
				problemFound = true;
				break;
			}
		}
		report(requireSection(config, "toUnicode"), !problemFound);
		return problemFound;
	}

	/**
	 * Checks one page's resources. Fonts declared on the page are checked directly;
	 * only when the page declares no fonts are its form XObjects inspected.
	 */
	private static boolean pageHasToUnicodeProblem(PDResources res) {
		if (res == null) {
			return false;
		}
		if (res.getFontNames().iterator().hasNext()) {
			return detectToUnicodeProblem(res);
		}
		for (COSName xObjectName : res.getXObjectNames()) {
			try {
				// XObjects that are not forms (e.g. images) are skipped.
				Object xObject = res.getXObject(xObjectName);
				if (xObject instanceof PDFormXObject
						&& detectToUnicodeProblem(((PDFormXObject) xObject).getResources())) {
					return true;
				}
			} catch (IOException e) {
				// Best effort: an unreadable XObject must not abort the whole scan.
				System.err.println("Skipping unreadable XObject " + xObjectName.getName() + ": " + e);
			}
		}
		return false;
	}

	/**
	 * Reports whether any font in the given resources has the Identity-H encoding name
	 * but no ToUnicode stream.
	 *
	 * @param pdResources resources to inspect; {@code null} is treated as "no fonts"
	 * @return {@code true} when at least one such font is found
	 */
	private static boolean detectToUnicodeProblem(PDResources pdResources) {
		if (pdResources == null) {
			return false;
		}
		for (COSName fontName : pdResources.getFontNames()) {
			try {
				PDFont font = pdResources.getFont(fontName);
				if (font == null) {
					continue;
				}
				COSName encoding = font.getCOSObject().getCOSName(COSName.ENCODING);
				// Compare by value: == on strings only tests reference identity.
				if (encoding != null && IDENTITY_H.equals(encoding.getName())) {
					COSStream toUnicode = font.getCOSObject().getCOSStream(COSName.TO_UNICODE);
					if (toUnicode == null) {
						return true;
					}
				}
			} catch (IOException e) {
				// Best effort: one unreadable font must not hide problems in the others.
				System.err.println("Skipping unreadable font " + fontName.getName() + ": " + e);
			}
		}
		return false;
	}

	/**
	 * Prints the section's configured pass or fail message. On failure it also prints
	 * {@code Exiting} and terminates the JVM with status 0.
	 *
	 * @return {@code passed}, for convenient chaining by the callers
	 */
	private static boolean report(JSONObject section, boolean passed) {
		JSONObject messages = requireSection(section, "messages");
		System.out.println((String) messages.get(passed ? "pass" : "fail"));
		if (!passed) {
			System.out.println("Exiting");
			System.exit(0);
		}
		return passed;
	}

	/** Returns the nested JSON object stored under {@code key}, or fails with a clear message. */
	private static JSONObject requireSection(JSONObject parent, String key) {
		Object value = parent.get(key);
		if (!(value instanceof JSONObject)) {
			throw new IllegalStateException("config.json: missing or invalid object \"" + key + "\"");
		}
		return (JSONObject) value;
	}

	/** Returns the numeric limit stored under {@code key}, accepting any JSON number type. */
	private static long requireLimit(JSONObject section, String key) {
		Object value = section.get(key);
		if (!(value instanceof Number)) {
			throw new IllegalStateException("config.json: missing or non-numeric value \"" + key + "\"");
		}
		return ((Number) value).longValue();
	}

	/** Returns the file's length in megabytes (1 MB = 1024 * 1024 bytes). */
	private static double getFileSizeMegaBytes(File file) {
		return (double) file.length() / (1024 * 1024);
	}
}
	{
	"maxFileSize": {
	"maxFileSizeMB": 100,
	"messages": {
	"pass": "File size is acceptable.",
	"fail": "The file is is over 100MB. Too large for Extract API."
	}
	},
	"maxNumPages": {
	"maxNumPages": 400,
	"messages": {
	"pass": "The number of pages in the file is acceptable.",
	"fail": "The file is over 400 pages. Too long for Extract API."
	}
	},
	"permissions": {
	"messages": {
	"pass": "Content Extraction is allowed.",
	"fail": "Content Extraction NOT is allowed."
	}
	},
	"toUnicode": {
	"messages": {
	"pass": "No encoding issues for fonts with Identity-H encoding.",
	"fail": "The file contains fonts with Identity-H encoding that are missing ToUnicode CMaps. Extract API output may not contain usable text."
	}
	}
	}
	/* Adobe Extract API Preprocessors by practicalPDF Inc. © 2024 by Joel Geraci is licensed under CC BY-SA 4.0 */

	package com.practicalpdf.ExtractPreProcessors;

	import java.io.File;
	import java.io.FileReader;
	import java.io.IOException;
	import java.util.Collection;
	import org.apache.pdfbox.Loader;
	import org.apache.pdfbox.cos.COSName;
	import org.apache.pdfbox.cos.COSStream;
	import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
	import org.apache.pdfbox.pdmodel.PDDocument;
	import org.apache.pdfbox.pdmodel.PDPage;
	import org.apache.pdfbox.pdmodel.PDResources;
	import org.apache.pdfbox.pdmodel.font.PDFont;
	import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
	import org.json.simple.JSONObject;
	import org.json.simple.parser.JSONParser;
	import org.json.simple.parser.ParseException;

	/**
	 * Command-line pre-flight checks for a PDF file.
	 *
	 * <p>Reads limits and user-facing messages from {@code config.json} in the working
	 * directory, then runs four checks in order: file size, page count, content-extraction
	 * permissions, and Identity-H fonts lacking a ToUnicode CMap. Each check prints its
	 * configured pass or fail message; on the first failure the program prints
	 * {@code Exiting} and terminates with status 0.
	 */
	public class PPDF_Extract_Preprocessors {
		/** Name of the font encoding that the ToUnicode check looks for. */
		private static final String IDENTITY_H = "Identity-H";

		/**
		 * Entry point.
		 *
		 * @param args {@code args[0]} is the path of the PDF file to check
		 */
		public static void main(String[] args) {
			if (args == null || args.length == 0) {
				System.err.println("Usage: PPDF_Extract_Preprocessors <path-to-pdf>");
				System.exit(1);
			}
			String fileName = args[0];

			try (FileReader configReader = new FileReader("config.json")) {
				JSONObject config = (JSONObject) new JSONParser().parse(configReader);

				// Cheap check first: an oversized file is rejected before PDFBox parses it.
				boolean isFileSizeAcceptable = isFileSizeAcceptable(fileName, config);

				// try-with-resources releases the document on the normal path; a failed
				// check ends the process through System.exit instead.
				try (PDDocument pdfDocument = Loader.loadPDF(new RandomAccessReadBufferedFile(fileName))) {
					boolean isNumPagesAcceptable = isNumPagesAcceptable(pdfDocument, config);
					boolean isTextExtractionAllowed = isTextExtractionAllowed(pdfDocument, config);
					boolean hasToUnicodeProblem = hasToUnicodeProblem(pdfDocument, config);

					System.out.println("isFileSizeAcceptable: " + isFileSizeAcceptable);
					System.out.println("isNumPagesAcceptable: " + isNumPagesAcceptable);
					System.out.println("isTextExtractionAllowed: " + isTextExtractionAllowed);
					System.out.println("hasToUnicodeProblem: " + hasToUnicodeProblem);
				}
			} catch (IOException e) {
				// Keep the original stdout marker, but do not hide the cause.
				System.out.println("Error");
				System.err.println("I/O failure while processing " + fileName + ": " + e);
			} catch (ParseException e) {
				System.err.println("Could not parse config.json: " + e);
			}
		}

		/**
		 * Checks the file size, rounded up to whole megabytes, against
		 * {@code maxFileSize.maxFileSizeMB}.
		 *
		 * @param fileName path of the file to measure
		 * @param config   parsed {@code config.json}
		 * @return {@code true} when the size is within the limit (the process exits otherwise)
		 */
		private static boolean isFileSizeAcceptable(String fileName, JSONObject config) {
			JSONObject section = requireSection(config, "maxFileSize");
			long maxFileSizeMB = requireLimit(section, "maxFileSizeMB");
			long fileSizeMB = (long) Math.ceil(getFileSizeMegaBytes(new File(fileName)));
			return report(section, fileSizeMB <= maxFileSizeMB);
		}

		/**
		 * Checks the document's page count against {@code maxNumPages.maxNumPages}.
		 *
		 * @param pdfDocument loaded document
		 * @param config      parsed {@code config.json}
		 * @return {@code true} when the page count is within the limit (the process exits otherwise)
		 */
		private static boolean isNumPagesAcceptable(PDDocument pdfDocument, JSONObject config) {
			JSONObject section = requireSection(config, "maxNumPages");
			long maxNumPages = requireLimit(section, "maxNumPages");
			return report(section, pdfDocument.getNumberOfPages() <= maxNumPages);
		}

		/**
		 * Checks that the current access permissions allow both content extraction and
		 * extraction for accessibility.
		 *
		 * @param pdfDocument loaded document
		 * @param config      parsed {@code config.json}
		 * @return {@code true} when both permissions are granted (the process exits otherwise)
		 */
		private static boolean isTextExtractionAllowed(PDDocument pdfDocument, JSONObject config) {
			boolean allowed = pdfDocument.getCurrentAccessPermission().canExtractContent()
					&& pdfDocument.getCurrentAccessPermission().canExtractForAccessibility();
			return report(requireSection(config, "permissions"), allowed);
		}

		/**
		 * Scans every page for an Identity-H font that has no ToUnicode stream.
		 *
		 * @param pdfDocument loaded document
		 * @param config      parsed {@code config.json}
		 * @return {@code false} when no such font was found (the process exits otherwise)
		 */
		private static boolean hasToUnicodeProblem(PDDocument pdfDocument, JSONObject config) {
			boolean problemFound = false;
			for (int i = 0; i < pdfDocument.getNumberOfPages(); ++i) {
				if (pageHasToUnicodeProblem(pdfDocument.getPage(i).getResources())) {
					// Stop at the first hit so a later clean page cannot reset the result.
					problemFound = true;
					break;
				}
			}
			report(requireSection(config, "toUnicode"), !problemFound);
			return problemFound;
		}

		/**
		 * Checks one page's resources. Fonts declared on the page are checked directly;
		 * only when the page declares no fonts are its form XObjects inspected.
		 */
		private static boolean pageHasToUnicodeProblem(PDResources res) {
			if (res == null) {
				return false;
			}
			if (res.getFontNames().iterator().hasNext()) {
				return detectToUnicodeProblem(res);
			}
			for (COSName xObjectName : res.getXObjectNames()) {
				try {
					// XObjects that are not forms (e.g. images) are skipped.
					Object xObject = res.getXObject(xObjectName);
					if (xObject instanceof PDFormXObject
							&& detectToUnicodeProblem(((PDFormXObject) xObject).getResources())) {
						return true;
					}
				} catch (IOException e) {
					// Best effort: an unreadable XObject must not abort the whole scan.
					System.err.println("Skipping unreadable XObject " + xObjectName.getName() + ": " + e);
				}
			}
			return false;
		}

		/**
		 * Reports whether any font in the given resources has the Identity-H encoding name
		 * but no ToUnicode stream.
		 *
		 * @param pdResources resources to inspect; {@code null} is treated as "no fonts"
		 * @return {@code true} when at least one such font is found
		 */
		private static boolean detectToUnicodeProblem(PDResources pdResources) {
			if (pdResources == null) {
				return false;
			}
			for (COSName fontName : pdResources.getFontNames()) {
				try {
					PDFont font = pdResources.getFont(fontName);
					if (font == null) {
						continue;
					}
					COSName encoding = font.getCOSObject().getCOSName(COSName.ENCODING);
					// Compare by value: == on strings only tests reference identity.
					if (encoding != null && IDENTITY_H.equals(encoding.getName())) {
						COSStream toUnicode = font.getCOSObject().getCOSStream(COSName.TO_UNICODE);
						if (toUnicode == null) {
							return true;
						}
					}
				} catch (IOException e) {
					// Best effort: one unreadable font must not hide problems in the others.
					System.err.println("Skipping unreadable font " + fontName.getName() + ": " + e);
				}
			}
			return false;
		}

		/**
		 * Prints the section's configured pass or fail message. On failure it also prints
		 * {@code Exiting} and terminates the JVM with status 0.
		 *
		 * @return {@code passed}, for convenient chaining by the callers
		 */
		private static boolean report(JSONObject section, boolean passed) {
			JSONObject messages = requireSection(section, "messages");
			System.out.println((String) messages.get(passed ? "pass" : "fail"));
			if (!passed) {
				System.out.println("Exiting");
				System.exit(0);
			}
			return passed;
		}

		/** Returns the nested JSON object stored under {@code key}, or fails with a clear message. */
		private static JSONObject requireSection(JSONObject parent, String key) {
			Object value = parent.get(key);
			if (!(value instanceof JSONObject)) {
				throw new IllegalStateException("config.json: missing or invalid object \"" + key + "\"");
			}
			return (JSONObject) value;
		}

		/** Returns the numeric limit stored under {@code key}, accepting any JSON number type. */
		private static long requireLimit(JSONObject section, String key) {
			Object value = section.get(key);
			if (!(value instanceof Number)) {
				throw new IllegalStateException("config.json: missing or non-numeric value \"" + key + "\"");
			}
			return ((Number) value).longValue();
		}

		/** Returns the file's length in megabytes (1 MB = 1024 * 1024 bytes). */
		private static double getFileSizeMegaBytes(File file) {
			return (double) file.length() / (1024 * 1024);
		}
	}