Skip to content

Instantly share code, notes, and snippets.

@aheld
Created September 24, 2013 18:41
Show Gist options
  • Save aheld/6689311 to your computer and use it in GitHub Desktop.
Save aheld/6689311 to your computer and use it in GitHub Desktop.
Quick and dirty way to extract names out of a PDF file and into a CSV file. Inspired by (copied from) http://thottingal.in/blog/2009/06/24/pdfbox-extract-text-from-pdf/
package com.aaronheld;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Command-line tool that extracts the text of a PDF with PDFBox and prints the
 * name / address / number records it finds as CSV rows on standard output.
 *
 * <p>Usage: {@code Main <file.pdf>}. The raw extracted lines are echoed first,
 * then a separator line, then one CSV row per record.
 *
 * <p>NOTE(review): the three patterns below look tailored to one specific report
 * layout (column text such as "Call Resu", "Not Home") -- confirm against a
 * sample PDF before reusing this on other documents.
 */
public class Main {
    /** Line that starts a new record; group 1 is stored as the name. */
    private static final Pattern RE_NAME = Pattern.compile("^(.+) Call Resu.*");
    /** Line carrying the address; group 1 is stored as the address. */
    private static final Pattern RE_ADD = Pattern.compile("^(.+) Not Home .*");
    /** Line carrying the number; group 1 is stored as the number. */
    private static final Pattern RE_NUMBER = Pattern.compile("^(.+) \\d\\d \\w \\w oRe.*");

    // Fields of the record currently being assembled; read by printline().
    static String name;
    static String add;
    static String number;

    /**
     * Extracts the text of the PDF named by {@code args[0]}, echoes it, and then
     * prints the records found in it as CSV.
     *
     * <p>Exits with status 1 (after a message on standard error) when no file
     * argument is given or the PDF text could not be extracted.
     *
     * @param args command-line arguments; {@code args[0]} is the PDF path
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: Main <file.pdf>");
            System.exit(1);
            return;
        }

        String text = pdftoText(args[0]);
        if (text == null) {
            // pdftoText has already reported the cause on standard error.
            System.exit(1);
            return;
        }

        // Start from a clean slate so a repeated call cannot emit a stale record.
        name = null;
        add = null;
        number = null;

        // Accept both LF and CRLF so no stray '\r' is left at the end of a line.
        String[] lines = text.split("\\r?\\n");

        for (String line : lines) {
            System.out.println(line);
        }
        System.out.println("**********************************************");
        System.out.println("");

        for (String line : lines) {
            Matcher m = RE_NAME.matcher(line);
            if (m.find()) {
                // A new name closes the previous record; it is only emitted
                // when an address was seen for it.
                if (add != null) {
                    printline();
                }
                name = m.group(1);
                add = null;
                number = null;
            }
            m = RE_ADD.matcher(line);
            if (m.find()) {
                add = m.group(1);
            }
            m = RE_NUMBER.matcher(line);
            if (m.find()) {
                number = m.group(1);
            }
        }
        // Flush the last record, which has no following name line to trigger it.
        if (add != null) {
            printline();
        }
        System.out.println();
    }

    /**
     * Prints the current {@code name}, {@code add} and {@code number} fields as
     * one CSV row. Every field is quoted; a field that was never set is written
     * as an empty quoted field rather than the text "null".
     */
    static void printline() {
        System.out.println(csvField(name) + "," + csvField(add) + "," + csvField(number));
    }

    /** Quotes a value for CSV, doubling any embedded double quote. */
    private static String csvField(String value) {
        if (value == null) {
            return "\"\"";
        }
        return '"' + value.replace("\"", "\"\"") + '"';
    }

    /**
     * Extracts the text of a PDF file using PDFBox.
     *
     * @param fileName path of the PDF file to read
     * @return the extracted text, or {@code null} if the path is not a regular
     *         file, the file cannot be opened, or parsing fails (a message is
     *         written to standard error in each case)
     */
    static String pdftoText(String fileName) {
        File file = new File(fileName);
        if (!file.isFile()) {
            System.err.println("File " + fileName + " does not exist.");
            return null;
        }

        String parsedText = null;
        FileInputStream input = null;
        PDDocument pdDoc = null;
        COSDocument cosDoc = null;
        try {
            PDFParser parser;
            try {
                input = new FileInputStream(file);
                parser = new PDFParser(input);
            } catch (IOException e) {
                System.err.println("Unable to open PDF Parser. " + e.getMessage());
                return null;
            }

            try {
                parser.parse();
                cosDoc = parser.getDocument();
                PDFTextStripper pdfStripper = new PDFTextStripper();
                pdDoc = new PDDocument(cosDoc);
                pdfStripper.setStartPage(1);
                parsedText = pdfStripper.getText(pdDoc);
            } catch (Exception e) {
                System.err.println("An exception occurred in parsing the PDF Document. "
                        + e.getMessage());
            }
        } finally {
            // Each resource is closed in its own try so that a failure closing
            // one does not prevent the others from being released.
            if (cosDoc != null) {
                try {
                    cosDoc.close();
                } catch (IOException e) {
                    System.err.println("Failed to close COS document. " + e.getMessage());
                }
            }
            if (pdDoc != null) {
                try {
                    pdDoc.close();
                } catch (IOException e) {
                    System.err.println("Failed to close PDF document. " + e.getMessage());
                }
            }
            // Closed explicitly so the file handle is released on every path,
            // whether or not the parser closes it itself.
            if (input != null) {
                try {
                    input.close();
                } catch (IOException e) {
                    System.err.println("Failed to close " + fileName + ". " + e.getMessage());
                }
            }
        }
        return parsedText;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment