Last active
March 26, 2024 19:04
-
-
Save JoelGeraci/76a2c2bf13adfc1542c95e3ba3f597a2 to your computer and use it in GitHub Desktop.
This Gist will determine if a PDF file will produce usable results from the Adobe Extract API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "maxFileSize": {
    "maxFileSizeMB": 100,
    "messages": {
      "pass": "File size is acceptable.",
      "fail": "The file is over 100MB. Too large for Extract API."
    }
  },
  "maxNumPages": {
    "maxNumPages": 400,
    "messages": {
      "pass": "The number of pages in the file is acceptable.",
      "fail": "The file is over 400 pages. Too long for Extract API."
    }
  },
  "permissions": {
    "messages": {
      "pass": "Content Extraction is allowed.",
      "fail": "Content Extraction is NOT allowed."
    }
  },
  "toUnicode": {
    "messages": {
      "pass": "No encoding issues for fonts with Identity-H encoding.",
      "fail": "The file contains fonts with Identity-H encoding that are missing ToUnicode CMaps. Extract API output may not contain usable text."
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Adobe Extract API Preprocessors by practicalPDF Inc. © 2024 by Joel Geraci is licensed under CC BY-SA 4.0 */ | |
package com.practicalpdf.ExtractPreProcessors; | |
import java.io.File; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import java.util.Collection; | |
import org.apache.pdfbox.Loader; | |
import org.apache.pdfbox.cos.COSName; | |
import org.apache.pdfbox.cos.COSStream; | |
import org.apache.pdfbox.io.RandomAccessReadBufferedFile; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.pdmodel.PDPage; | |
import org.apache.pdfbox.pdmodel.PDResources; | |
import org.apache.pdfbox.pdmodel.font.PDFont; | |
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; | |
import org.json.simple.JSONObject; | |
import org.json.simple.parser.JSONParser; | |
import org.json.simple.parser.ParseException; | |
public class PPDF_Extract_Preprocessors { | |
public static void main(String args[]) { | |
String fileName = args[0]; | |
try { | |
JSONParser parser = new JSONParser(); | |
JSONObject config = (JSONObject) parser.parse(new FileReader("config.json")); | |
PDDocument pdfDocument = Loader.loadPDF(new RandomAccessReadBufferedFile(fileName)); | |
Boolean isFileSizeAcceptable = isFileSizeAcceptable(fileName, config); | |
Boolean isNumPagesAcceptable = isNumPagesAcceptable(pdfDocument, config); | |
Boolean isTextExtractionAllowed = isTextExtractionAllowed(pdfDocument, config); | |
Boolean hasToUnicodeProblem = hasToUnicodeProblem(pdfDocument, config); | |
System.out.println("isFileSizeAcceptable: " + isFileSizeAcceptable); | |
System.out.println("isNumPagesAcceptable: " + isNumPagesAcceptable); | |
System.out.println("isTextExtractionAllowed: " + isTextExtractionAllowed); | |
System.out.println("hasToUnicodeProblem: " + hasToUnicodeProblem); | |
} | |
catch(IOException e) { | |
System.out.println("Error"); | |
} catch (ParseException e) { | |
e.printStackTrace(); | |
} | |
} | |
private static Boolean isFileSizeAcceptable (String fileName, JSONObject config) { | |
Boolean isFileSizeAcceptable = null; | |
int fileSize = (int) Math.ceil(getFileSizeMegaBytes(new File (fileName))); | |
JSONObject maxFileSize = (JSONObject) config.get("maxFileSize"); | |
Long maxFileSizeMB = (Long) maxFileSize.get("maxFileSizeMB"); | |
JSONObject messages = (JSONObject) maxFileSize.get("messages"); | |
String pass = (String) messages.get("pass"); | |
String fail = (String) messages.get("fail"); | |
if (fileSize <= maxFileSizeMB) { | |
isFileSizeAcceptable = true; | |
System.out.println(pass); | |
} | |
else { | |
isFileSizeAcceptable = false; | |
System.out.println(fail); | |
System.out.println("Exiting"); | |
System.exit(0); | |
} | |
return isFileSizeAcceptable; | |
} | |
private static Boolean isNumPagesAcceptable (PDDocument pdfDocument, JSONObject config) { | |
Boolean isNumPagesAcceptable = null; | |
JSONObject maxNumPagesSection = (JSONObject) config.get("maxNumPages"); | |
Long maxNumPages = (Long) maxNumPagesSection.get("maxNumPages"); | |
JSONObject messages = (JSONObject) maxNumPagesSection.get("messages"); | |
String pass = (String) messages.get("pass"); | |
String fail = (String) messages.get("fail"); | |
int numPages = pdfDocument.getNumberOfPages(); | |
if (numPages <= maxNumPages) { | |
isNumPagesAcceptable = true; | |
System.out.println(pass); | |
} | |
else { | |
System.out.println(fail); | |
isNumPagesAcceptable = false; | |
System.out.println("Exiting"); | |
System.exit(0); | |
} | |
return isNumPagesAcceptable; | |
} | |
private static Boolean isTextExtractionAllowed (PDDocument pdfDocument, JSONObject config) { | |
Boolean isTextExtractionAllowed = null; | |
JSONObject permissions = (JSONObject) config.get("permissions"); | |
JSONObject messages = (JSONObject) permissions.get("messages"); | |
String pass = (String) messages.get("pass"); | |
String fail = (String) messages.get("fail"); | |
if (pdfDocument.getCurrentAccessPermission().canExtractContent() == true && pdfDocument.getCurrentAccessPermission().canExtractForAccessibility() == true) { | |
isTextExtractionAllowed = true; | |
System.out.println(pass); | |
} | |
else { | |
isTextExtractionAllowed = false; | |
System.out.println(fail); | |
System.out.println("Exiting"); | |
System.exit(0); | |
} | |
return isTextExtractionAllowed; | |
} | |
private static Boolean hasToUnicodeProblem(PDDocument pdfDocument, JSONObject config) { | |
Boolean hasToUnicodeProblem = null; | |
JSONObject toUnicode = (JSONObject) config.get("toUnicode"); | |
JSONObject messages = (JSONObject) toUnicode.get("messages"); | |
String pass = (String) messages.get("pass"); | |
String fail = (String) messages.get("fail"); | |
for (int i = 0; i < pdfDocument.getNumberOfPages(); ++i) { | |
PDPage page = pdfDocument.getPage(i); | |
PDResources res = page.getResources(); | |
if (((Collection<COSName>) res.getFontNames()).isEmpty()) { | |
for (COSName xObjectName : res.getXObjectNames()) { | |
PDFormXObject pdXObject; | |
try { | |
pdXObject = (PDFormXObject) res.getXObject(xObjectName); | |
hasToUnicodeProblem = detectToUnicodeProblem(pdXObject.getResources()); | |
if (hasToUnicodeProblem == true) { | |
break; | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
} | |
else { | |
hasToUnicodeProblem = detectToUnicodeProblem(res); | |
if (hasToUnicodeProblem == true) { | |
break; | |
} | |
} | |
} | |
if (hasToUnicodeProblem == true) { | |
System.out.println(fail); | |
System.out.println("Exiting"); | |
System.exit(0); | |
} | |
else { | |
System.out.println(pass); | |
} | |
return hasToUnicodeProblem; | |
} | |
private static Boolean detectToUnicodeProblem(PDResources pdResources) { | |
Boolean fontEncodingProblems = false; | |
try { | |
for (COSName fontName : pdResources.getFontNames()) { | |
PDFont font = pdResources.getFont(fontName); | |
COSName encoding = font.getCOSObject().getCOSName(COSName.ENCODING); | |
if (encoding != null && encoding.getName() == "Identity-H") { | |
COSStream toUnicode = font.getCOSObject().getCOSStream(COSName.TO_UNICODE); | |
if (toUnicode == null) { | |
fontEncodingProblems = true; | |
break; | |
} | |
} | |
} | |
} | |
catch (IOException e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
return fontEncodingProblems; | |
} | |
private static double getFileSizeMegaBytes(File file) { | |
return (double) file.length() / (1024 * 1024); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment