Skip to content

Instantly share code, notes, and snippets.

@digulla

digulla/PageRegion

Last active Jan 25, 2017
Embed
What would you like to do?
Support code to write unit tests that validate PDF documents

Dependencies:

  • Guava > 19
  • PDFBox > 2.0
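  • JUnit 4 (for `org.junit.Assert.assertEquals`)
  • A helper class which wraps IOException in a RuntimeException (imported in the code as `tools.RuntimeIOException`)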
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.text.PDFTextStripperByArea;
/**
 * A named rectangular area of a PDF page.
 *
 * <p>Instances are immutable. A region registers itself with a
 * {@link PDFTextStripperByArea} through {@link #addTo(PDFTextStripperByArea)};
 * the name is the key under which the stripper stores the region.
 */
public class PageRegion {

    /** Key under which the region is registered with a stripper. */
    private final String name;

    /** Normalized rectangle spanned by the two corners given to the constructor. */
    private final Rectangle2D rect;

    /**
     * Creates a region from two corner points.
     *
     * <p>Coordinates are in the order in which the Apache PDF-Box Debugger displays them,
     * i.e. <b>y before x</b> for each corner. The two corners may be given in any order;
     * the rectangle is normalized to start at the smaller x/y values.
     *
     * @param name name used to register and look up the region
     * @param y1 y coordinate of the first corner
     * @param x1 x coordinate of the first corner
     * @param y2 y coordinate of the second corner
     * @param x2 x coordinate of the second corner
     */
    public PageRegion(String name, double y1, double x1, double y2, double x2) {
        this.name = name;

        double x = Math.min(x1, x2);
        double y = Math.min(y1, y2);
        // One unit is added to each extent; presumably so that both corner
        // coordinates are treated as lying inside the region.
        double width = Math.abs(x1 - x2) + 1;
        double height = Math.abs(y1 - y2) + 1;

        this.rect = new Rectangle2D.Double(x, y, width, height);
    }

    /**
     * Returns the name of this region.
     *
     * @return the name passed to the constructor
     */
    public String getName() {
        return name;
    }

    /**
     * Registers this region with the given stripper under {@link #getName()}.
     *
     * @param stripper the stripper which should extract text for this region
     */
    public void addTo(PDFTextStripperByArea stripper) {
        stripper.addRegion(name, rect);
    }
}
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import com.google.common.base.Preconditions;
import tools.RuntimeIOException;
/**
 * Test helper which loads a PDF document and extracts its text, either as a
 * whole or restricted to named {@link PageRegion}s, and offers simple
 * "must contain" assertions on the result.
 *
 * <p>The extractor owns the loaded {@link PDDocument}. Call {@link #close()}
 * when done, or use the extractor in a try-with-resources statement. Note that
 * {@link #getText()} closes the document itself once the text has been read.
 */
public class ReportTextExtractor implements AutoCloseable {

    /** The file the document was loaded from; used for error messages. */
    private final File file;

    /** The loaded document; {@code null} once {@link #close()} has been called. */
    private PDDocument document;

    /** Created lazily when the first region is added. */
    private PDFTextStripperByArea byAreaStripper;

    /** Whether the currently registered regions have been extracted already. */
    private boolean processed = false;

    /** Index of the page handed to {@link PDDocument#getPage(int)} for region extraction. */
    private int page = 0;

    /**
     * Loads the given PDF file.
     *
     * @param file the PDF document to read
     * @throws RuntimeIOException if the document cannot be loaded
     */
    public ReportTextExtractor(File file) {
        this.file = file;
        try {
            document = PDDocument.load(file);
        } catch (IOException e) {
            throw new RuntimeIOException("Error loading " + file.getAbsolutePath(), e);
        }
    }

    /**
     * Creates and configures the stripper used for region based extraction.
     * Subclasses may override this to change the configuration.
     *
     * @return a new stripper with additional formatting and position sorting enabled
     * @throws RuntimeIOException if the stripper cannot be created
     */
    protected PDFTextStripperByArea createByAreaStripper() {
        PDFTextStripperByArea result;
        try {
            result = new PDFTextStripperByArea();
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
        result.setAddMoreFormatting(true);
        result.setSortByPosition(true);
        return result;
    }

    /**
     * Registers a region whose text can later be fetched with
     * {@link #getText(PageRegion)}.
     *
     * @param region the region to register
     * @return this extractor, to allow chaining
     */
    public ReportTextExtractor addRegion(PageRegion region) {
        if (byAreaStripper == null) {
            byAreaStripper = createByAreaStripper();
        }
        region.addTo(byAreaStripper);
        // A region added after an extraction has no text yet. Force the next
        // getText(PageRegion) call to extract again so it sees this region.
        processed = false;
        return this;
    }

    /**
     * Returns the post-processed text found inside the given region.
     *
     * <p>The regions are extracted on the first call after a region was added;
     * further calls reuse that result.
     *
     * @param region a region previously passed to {@link #addRegion(PageRegion)}
     * @return the text inside the region, after {@link #postProcess(String)}
     * @throws IllegalStateException if no region has been added yet, or if the
     *         regions still need to be extracted but the document is closed
     * @throws IllegalArgumentException if the region is unknown
     * @throws RuntimeIOException if reading the page fails
     */
    public String getText(PageRegion region) {
        if (byAreaStripper == null) {
            throw new IllegalStateException("Add some region, first!");
        }
        Preconditions.checkArgument(byAreaStripper.getRegions().contains(region.getName()),
                "Unknown region [%s]; valid names are: %s", region.getName(), byAreaStripper.getRegions());

        if (!processed) {
            requireOpenDocument();
            try {
                byAreaStripper.extractRegions(document.getPage(page));
            } catch (IOException e) {
                throw new RuntimeIOException("Error processing page " + page, e);
            }
            processed = true;
        }

        String result = byAreaStripper.getTextForRegion(region.getName());
        return postProcess(result);
    }

    /**
     * Get all the text of the document at once.
     *
     * @param file the PDF document to read
     * @return the post-processed text of the whole document
     * @throws RuntimeIOException if the document cannot be loaded or read
     */
    public static String getText(File file) {
        return new ReportTextExtractor(file).getText();
    }

    /**
     * Returns the post-processed text of the whole document and then closes
     * the document, whether or not reading succeeded.
     *
     * @return the text of the document, after {@link #postProcess(String)}
     * @throws IllegalStateException if the document has already been closed
     * @throws RuntimeIOException if reading the document fails
     */
    public String getText() {
        requireOpenDocument();

        PDFTextStripper stripper;
        try {
            stripper = new PDFTextStripper();
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
        stripper.setAddMoreFormatting(true);
        stripper.setSortByPosition(true);

        try {
            String result = stripper.getText(document);
            return postProcess(result);
        } catch (IOException e) {
            throw new RuntimeIOException("Error getting text from " + file.getAbsolutePath(), e);
        } finally {
            close();
        }
    }

    /**
     * Normalizes extracted text: trims it and converts Windows line endings
     * to {@code \n}. Subclasses may override this to apply other clean-ups.
     *
     * @param result the raw extracted text
     * @return the normalized text
     */
    protected String postProcess(String result) {
        return result.trim()
                .replace("\r\n", "\n");
    }

    /**
     * Asserts that the text of the given region contains all fragments.
     *
     * @param region a region previously passed to {@link #addRegion(PageRegion)}
     * @param fragments the strings which must all occur in the region's text
     */
    public void mustContain(PageRegion region, String... fragments) {
        String text = getText(region);
        mustContain(text, fragments);
    }

    /**
     * Asserts that the given text contains all fragments.
     *
     * <p>When fragments are missing, the assertion fails by comparing the
     * missing fragments (separated by {@code ---} lines) against the whole
     * text, so the failure message shows both.
     *
     * @param text the text to search
     * @param fragments the strings which must all occur in {@code text}
     */
    public void mustContain(String text, String... fragments) {
        List<String> missing = new ArrayList<>();
        for (String fragment : fragments) {
            if (!text.contains(fragment)) {
                missing.add(fragment);
            }
        }

        if (!missing.isEmpty()) {
            String expected = String.join("\n---\n", missing);
            assertEquals(expected, text);
        }
    }

    /**
     * Closes the underlying document. Calling this method again after a
     * successful close does nothing.
     *
     * @throws RuntimeIOException if closing the document fails
     */
    @Override
    public void close() {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                throw new RuntimeIOException(e);
            }
            document = null;
        }
    }

    /** Fails with a descriptive message instead of a NullPointerException after close(). */
    private void requireOpenDocument() {
        if (document == null) {
            throw new IllegalStateException(
                    "Document " + file.getAbsolutePath() + " has already been closed");
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment