Dependencies:
- Guava > 19
- PDFBox > 2.0
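- A helper class (`tools.RuntimeIOException`) that wraps an IOException in a RuntimeException
- JUnit 4 (`org.junit.Assert.assertEquals` is used by `mustContain`)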
import java.awt.geom.Rectangle2D; | |
import org.apache.pdfbox.text.PDFTextStripperByArea; | |
public class PageRegion { | |
private String name; | |
private Rectangle2D rect; | |
// Coordinates are in the order in which the Apache PDF-Box Debugger displays them | |
public PageRegion(String name, double y1, double x1, double y2, double x2) { | |
this.name = name; | |
double x = Math.min(x1, x2); | |
double y = Math.min(y1, y2); | |
double width = Math.abs(x1 - x2) + 1; | |
double height = Math.abs(y1 - y2) + 1; | |
this.rect = new Rectangle2D.Double(x, y, width, height); | |
} | |
public String getName() { | |
return name; | |
} | |
public void addTo(PDFTextStripperByArea stripper) { | |
stripper.addRegion(name, rect); | |
} | |
} |
import static org.junit.Assert.assertEquals; | |
import java.io.File; | |
import java.io.IOException; | |
import java.util.ArrayList; | |
import java.util.List; | |
import java.util.stream.Collectors; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.text.PDFTextStripper; | |
import org.apache.pdfbox.text.PDFTextStripperByArea; | |
import com.google.common.base.Preconditions; | |
import tools.RuntimeIOException; | |
public class ReportTextExtractor { | |
private PDDocument document; | |
private File file; | |
public ReportTextExtractor(File file) { | |
this.file = file; | |
try { | |
document = PDDocument.load(file); | |
} catch (IOException e) { | |
throw new RuntimeIOException("Error loading " + file.getAbsolutePath(), e); | |
} | |
} | |
private PDFTextStripperByArea byAreaStripper; | |
protected PDFTextStripperByArea createByAreaStripper() { | |
PDFTextStripperByArea result; | |
try { | |
result = new PDFTextStripperByArea(); | |
} catch (IOException e) { | |
throw new RuntimeIOException(e); | |
} | |
result.setAddMoreFormatting(true); | |
result.setSortByPosition(true); | |
return result; | |
} | |
public ReportTextExtractor addRegion(PageRegion region) { | |
if (byAreaStripper == null) { | |
byAreaStripper = createByAreaStripper(); | |
} | |
region.addTo(byAreaStripper); | |
return this; | |
} | |
private boolean processed = false; | |
private int page = 0; | |
public String getText(PageRegion region) { | |
if (byAreaStripper == null) { | |
throw new IllegalStateException("Add some region, first!"); | |
} | |
Preconditions.checkArgument(byAreaStripper.getRegions().contains(region.getName()), | |
"Unknown region [%s]; valid names are: %s", region.getName(), byAreaStripper.getRegions()); | |
if (!processed) { | |
try { | |
byAreaStripper.extractRegions(document.getPage(page)); | |
} catch (IOException e) { | |
throw new RuntimeIOException("Error processing page " + page, e); | |
} | |
processed = true; | |
} | |
String result = byAreaStripper.getTextForRegion(region.getName()); | |
result = postProcess(result); | |
return result; | |
} | |
/** Get all the text of the document at once */ | |
public static String getText(File file) { | |
return new ReportTextExtractor(file).getText(); | |
} | |
public String getText() { | |
PDFTextStripper stripper; | |
try { | |
stripper = new PDFTextStripper(); | |
} catch (IOException e) { | |
throw new RuntimeIOException(e); | |
} | |
stripper.setAddMoreFormatting(true); | |
stripper.setSortByPosition(true); | |
try { | |
String result = stripper.getText(document); | |
result = postProcess(result); | |
return result; | |
} catch (IOException e) { | |
throw new RuntimeIOException("Error getting text from " + file.getAbsolutePath(), e); | |
} finally { | |
close(); | |
} | |
} | |
protected String postProcess(String result) { | |
return result.trim() | |
.replace("\r\n", "\n"); | |
} | |
public void mustContain(PageRegion region, String... fragments) { | |
String text = getText(region); | |
mustContain(text, fragments); | |
} | |
public void mustContain(String text, String... fragments) { | |
List<String> missing = new ArrayList<>(); | |
for(String fragment: fragments) { | |
if (!text.contains(fragment)) { | |
missing.add(fragment); | |
} | |
} | |
if (!missing.isEmpty()) { | |
String expected = missing.stream() | |
.collect(Collectors.joining("\n---\n")); | |
assertEquals(expected, text); | |
} | |
} | |
public void close() { | |
if (document != null) { | |
try { | |
document.close(); | |
} catch (IOException e) { | |
throw new RuntimeIOException(e); | |
} | |
document = null; | |
} | |
} | |
} |