Skip to content

Instantly share code, notes, and snippets.

@JoelGeraci
Last active March 26, 2024 19:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JoelGeraci/76a2c2bf13adfc1542c95e3ba3f597a2 to your computer and use it in GitHub Desktop.
Save JoelGeraci/76a2c2bf13adfc1542c95e3ba3f597a2 to your computer and use it in GitHub Desktop.
This Gist will determine if a PDF file will produce usable results from the Adobe Extract API
{
"maxFileSize": {
"maxFileSizeMB": 100,
"messages": {
"pass": "File size is acceptable.",
"fail": "The file is over 100MB. Too large for Extract API."
}
},
"maxNumPages": {
"maxNumPages": 400,
"messages": {
"pass": "The number of pages in the file is acceptable.",
"fail": "The file is over 400 pages. Too long for Extract API."
}
},
"permissions": {
"messages": {
"pass": "Content Extraction is allowed.",
"fail": "Content Extraction is NOT allowed."
}
},
"toUnicode": {
"messages": {
"pass": "No encoding issues for fonts with Identity-H encoding.",
"fail": "The file contains fonts with Identity-H encoding that are missing ToUnicode CMaps. Extract API output may not contain usable text."
}
}
}
/* Adobe Extract API Preprocessors by practicalPDF Inc. © 2024 by Joel Geraci is licensed under CC BY-SA 4.0 */
package com.practicalpdf.ExtractPreProcessors;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Collection;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
/**
 * Command-line pre-flight checks for a PDF file, driven by the thresholds and
 * messages stored in {@code config.json} (read from the working directory).
 *
 * <p>Four checks run in order: file size, page count, content-extraction
 * permissions, and Identity-H fonts that lack a ToUnicode CMap. Each check
 * prints its configured pass/fail message; the first failing check prints
 * "Exiting" and terminates the JVM.
 */
public class PPDF_Extract_Preprocessors {

    /**
     * Runs every check against the PDF named by the first argument and prints
     * a summary line per check.
     *
     * @param args {@code args[0]} is the path of the PDF file to inspect
     */
    public static void main(String args[]) {
        if (args == null || args.length == 0) {
            System.err.println("Usage: PPDF_Extract_Preprocessors <path-to-pdf>");
            return;
        }
        String fileName = args[0];
        if (!new File(fileName).isFile()) {
            System.out.println("Error: file not found: " + fileName);
            return;
        }
        try {
            JSONObject config;
            try (FileReader configReader = new FileReader("config.json")) {
                config = (JSONObject) new JSONParser().parse(configReader);
            }

            // The size check only needs the file on disk, so run it before
            // paying the cost of parsing a potentially oversized document.
            boolean fileSizeOk = isFileSizeAcceptable(fileName, config);

            // A failing check calls System.exit, which ends the process without
            // running close(); on every other path the document is closed here.
            try (PDDocument pdfDocument = Loader.loadPDF(new RandomAccessReadBufferedFile(fileName))) {
                boolean numPagesOk = isNumPagesAcceptable(pdfDocument, config);
                boolean extractionAllowed = isTextExtractionAllowed(pdfDocument, config);
                boolean toUnicodeProblem = hasToUnicodeProblem(pdfDocument, config);
                System.out.println("isFileSizeAcceptable: " + fileSizeOk);
                System.out.println("isNumPagesAcceptable: " + numPagesOk);
                System.out.println("isTextExtractionAllowed: " + extractionAllowed);
                System.out.println("hasToUnicodeProblem: " + toUnicodeProblem);
            }
        } catch (IOException e) {
            System.out.println("Error: " + e.getMessage());
        } catch (ParseException e) {
            System.err.println("Could not parse config.json: " + e);
        }
    }

    /**
     * Compares the file size, rounded up to whole megabytes, with
     * {@code maxFileSize.maxFileSizeMB} from the configuration.
     *
     * @return {@code true} when the file is within the limit; on failure the
     *         method prints the fail message and exits instead of returning
     */
    private static boolean isFileSizeAcceptable(String fileName, JSONObject config) {
        JSONObject maxFileSize = (JSONObject) config.get("maxFileSize");
        // Number rather than Long so a limit written as 100.0 is accepted too.
        long maxFileSizeMB = ((Number) maxFileSize.get("maxFileSizeMB")).longValue();
        JSONObject messages = (JSONObject) maxFileSize.get("messages");

        long fileSizeMB = (long) Math.ceil(getFileSizeMegaBytes(new File(fileName)));
        if (fileSizeMB > maxFileSizeMB) {
            failAndExit((String) messages.get("fail"));
        }
        System.out.println((String) messages.get("pass"));
        return true;
    }

    /**
     * Compares the document's page count with {@code maxNumPages.maxNumPages}
     * from the configuration.
     *
     * @return {@code true} when the page count is within the limit; on failure
     *         the method prints the fail message and exits instead of returning
     */
    private static boolean isNumPagesAcceptable(PDDocument pdfDocument, JSONObject config) {
        JSONObject maxNumPagesSection = (JSONObject) config.get("maxNumPages");
        long maxNumPages = ((Number) maxNumPagesSection.get("maxNumPages")).longValue();
        JSONObject messages = (JSONObject) maxNumPagesSection.get("messages");

        if (pdfDocument.getNumberOfPages() > maxNumPages) {
            failAndExit((String) messages.get("fail"));
        }
        System.out.println((String) messages.get("pass"));
        return true;
    }

    /**
     * Requires both the "extract content" and the "extract for accessibility"
     * permission on the document's current access permission.
     *
     * @return {@code true} when both permissions are granted; on failure the
     *         method prints the fail message and exits instead of returning
     */
    private static boolean isTextExtractionAllowed(PDDocument pdfDocument, JSONObject config) {
        JSONObject permissions = (JSONObject) config.get("permissions");
        JSONObject messages = (JSONObject) permissions.get("messages");

        boolean allowed = pdfDocument.getCurrentAccessPermission().canExtractContent()
                && pdfDocument.getCurrentAccessPermission().canExtractForAccessibility();
        if (!allowed) {
            failAndExit((String) messages.get("fail"));
        }
        System.out.println((String) messages.get("pass"));
        return true;
    }

    /**
     * Scans the pages for a font with Identity-H encoding and no ToUnicode
     * stream, stopping at the first page that has one.
     *
     * @return {@code false} when no problem was found; when a problem is found
     *         the method prints the fail message and exits instead of returning
     */
    private static boolean hasToUnicodeProblem(PDDocument pdfDocument, JSONObject config) {
        JSONObject toUnicode = (JSONObject) config.get("toUnicode");
        JSONObject messages = (JSONObject) toUnicode.get("messages");

        // A primitive that starts at false: a document with no pages, or with
        // only font-less pages, is reported as "no problem" instead of failing
        // on a null Boolean.
        boolean problemFound = false;
        for (PDPage page : pdfDocument.getPages()) {
            if (pageHasToUnicodeProblem(page)) {
                // Stop at the first hit so a later clean page cannot reset it.
                problemFound = true;
                break;
            }
        }

        if (problemFound) {
            failAndExit((String) messages.get("fail"));
        }
        System.out.println((String) messages.get("pass"));
        return problemFound;
    }

    /**
     * Checks one page: its own fonts when it has any, otherwise the fonts of
     * its form XObjects.
     */
    private static boolean pageHasToUnicodeProblem(PDPage page) {
        PDResources res = page.getResources();
        if (res == null) {
            // A page without a resource dictionary has no fonts to inspect.
            return false;
        }
        if (res.getFontNames().iterator().hasNext()) {
            // NOTE(review): form XObjects are not inspected when the page has
            // fonts of its own; kept as in the original logic - confirm intent.
            return detectToUnicodeProblem(res);
        }
        for (COSName xObjectName : res.getXObjectNames()) {
            try {
                // Held as Object so image XObjects are skipped by the
                // instanceof test rather than failing a cast.
                Object xObject = res.getXObject(xObjectName);
                if (xObject instanceof PDFormXObject
                        && detectToUnicodeProblem(((PDFormXObject) xObject).getResources())) {
                    return true;
                }
            } catch (IOException e) {
                System.err.println("Could not read XObject " + xObjectName.getName() + ": " + e.getMessage());
            }
        }
        return false;
    }

    /**
     * Reports whether any font in the given resources names Identity-H as its
     * encoding but has no ToUnicode stream.
     *
     * @param pdResources resources to inspect; {@code null} is treated as "no fonts"
     * @return {@code true} when at least one such font is found
     */
    private static boolean detectToUnicodeProblem(PDResources pdResources) {
        if (pdResources == null) {
            return false;
        }
        for (COSName fontName : pdResources.getFontNames()) {
            try {
                PDFont font = pdResources.getFont(fontName);
                if (font == null) {
                    continue;
                }
                COSName encoding = font.getCOSObject().getCOSName(COSName.ENCODING);
                // equals(), not ==: the comparison must not depend on interning.
                if (encoding != null && "Identity-H".equals(encoding.getName())) {
                    COSStream toUnicode = font.getCOSObject().getCOSStream(COSName.TO_UNICODE);
                    if (toUnicode == null) {
                        return true;
                    }
                }
            } catch (IOException e) {
                // One unreadable font must not hide problems in the others.
                System.err.println("Could not read font " + fontName.getName() + ": " + e.getMessage());
            }
        }
        return false;
    }

    /**
     * Prints the given failure message followed by "Exiting" and terminates
     * the JVM. Exit status 0 is kept from the original implementation.
     */
    private static void failAndExit(String failMessage) {
        System.out.println(failMessage);
        System.out.println("Exiting");
        System.exit(0);
    }

    /** Returns the length of the file in megabytes (1 MB = 1024 * 1024 bytes). */
    private static double getFileSizeMegaBytes(File file) {
        return (double) file.length() / (1024 * 1024);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment