Skip to content

Instantly share code, notes, and snippets.

@DavidYKay
Last active July 12, 2021 09:55
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save DavidYKay/82f20ba67c50c499ebb3 to your computer and use it in GitHub Desktop.
Save DavidYKay/82f20ba67c50c499ebb3 to your computer and use it in GitHub Desktop.
Example of using Java & PDFBox to extract text positions from a PDF: https://jackson-brain.com/using-pdfbox-to-locate-text-coordinates-within-a-pdf-in-java/
package printtextlocations;
import java.io.File;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
public class PrintTextLocations extends PDFTextStripper {
public static StringBuilder tWord = new StringBuilder();
public static String seek;
public static String[] seekA;
public static List wordList = new ArrayList();
public static boolean is1stChar = true;
public static boolean lineMatch;
public static int pageNo = 1;
public static double lastYVal;
public PrintTextLocations()
throws IOException {
super.setSortByPosition(true);
}
public static void main(String[] args)
throws Exception {
PDDocument document = null;
seekA = args[1].split(",");
seek = args[1];
try {
File input = new File(args[0]);
document = PDDocument.load(input);
if (document.isEncrypted()) {
try {
document.decrypt("");
} catch (InvalidPasswordException e) {
System.err.println("Error: Document is encrypted with a password.");
System.exit(1);
}
}
PrintTextLocations printer = new PrintTextLocations();
List allPages = document.getDocumentCatalog().getAllPages();
for (int i = 0; i < allPages.size(); i++) {
PDPage page = (PDPage) allPages.get(i);
PDStream contents = page.getContents();
if (contents != null) {
printer.processStream(page, page.findResources(), page.getContents().getStream());
}
pageNo += 1;
}
} finally {
if (document != null) {
System.out.println(wordList);
document.close();
}
}
}
@Override
protected void processTextPosition(TextPosition text) {
String tChar = text.getCharacter();
System.out.println("String[" + text.getXDirAdj() + ","
+ text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale="
+ text.getXScale() + " height=" + text.getHeightDir() + " space="
+ text.getWidthOfSpace() + " width="
+ text.getWidthDirAdj() + "]" + text.getCharacter());
String REGEX = "[,.\\[\\](:;!?)/]";
char c = tChar.charAt(0);
lineMatch = matchCharLine(text);
if ((!tChar.matches(REGEX)) && (!Character.isWhitespace(c))) {
if ((!is1stChar) && (lineMatch == true)) {
appendChar(tChar);
} else if (is1stChar == true) {
setWordCoord(text, tChar);
}
} else {
endWord();
}
}
protected void appendChar(String tChar) {
tWord.append(tChar);
is1stChar = false;
}
protected void setWordCoord(TextPosition text, String tChar) {
tWord.append("(").append(pageNo).append(")[").append(roundVal(Float.valueOf(text.getXDirAdj()))).append(" : ").append(roundVal(Float.valueOf(text.getYDirAdj()))).append("] ").append(tChar);
is1stChar = false;
}
protected void endWord() {
String newWord = tWord.toString().replaceAll("[^\\x00-\\x7F]", "");
String sWord = newWord.substring(newWord.lastIndexOf(' ') + 1);
if (!"".equals(sWord)) {
if (Arrays.asList(seekA).contains(sWord)) {
wordList.add(newWord);
} else if ("SHOWMETHEMONEY".equals(seek)) {
wordList.add(newWord);
}
}
tWord.delete(0, tWord.length());
is1stChar = true;
}
protected boolean matchCharLine(TextPosition text) {
Double yVal = roundVal(Float.valueOf(text.getYDirAdj()));
if (yVal.doubleValue() == lastYVal) {
return true;
}
lastYVal = yVal.doubleValue();
endWord();
return false;
}
protected Double roundVal(Float yVal) {
DecimalFormat rounded = new DecimalFormat("0.0'0'");
Double yValDub = new Double(rounded.format(yVal));
return yValDub;
}
}
@zygimantus
Copy link

Worth mentioning, that this code uses PDFBox version 1.8.4, while the current is 2.0.4.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment