Created
September 24, 2013 18:41
-
-
Save aheld/6689311 to your computer and use it in GitHub Desktop.
Quick and dirty way to extract names out of a PDF file and into a CSV file. Inspired by (copied from) http://thottingal.in/blog/2009/06/24/pdfbox-extract-text-from-pdf/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.aaronheld; | |
import org.apache.pdfbox.cos.COSDocument; | |
import org.apache.pdfbox.pdfparser.PDFParser; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.util.PDFTextStripper; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.IOException; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
public class Main { | |
static String name; | |
static String add; | |
static String number; | |
public static void main(String[] args) { | |
String text = pdftoText(args[0]); | |
String[] lines = text.split("\n"); | |
Pattern reName = Pattern.compile("^(.+) Call Resu.*"); | |
Pattern reAdd = Pattern.compile("^(.+) Not Home .*"); | |
Pattern reNumber = Pattern.compile("^(.+) \\d\\d \\w \\w oRe.*"); | |
Matcher m; | |
for(String line: lines){ | |
System.out.println(line); | |
} | |
System.out.println("**********************************************"); | |
System.out.println(""); | |
for(String line: lines){ | |
m = reName.matcher(line); | |
if (m.find()){ | |
if (add != null) printline(); | |
name = m.group(1); | |
add = null; | |
number = null; | |
} | |
m = reAdd.matcher(line); | |
if (m.find()){ | |
add = m.group(1); | |
} | |
m = reNumber.matcher(line); | |
if (m.find()){ | |
number = m.group(1); | |
} | |
} | |
if (add != null) printline(); | |
System.out.println(); | |
} | |
static void printline(){ | |
String output = String.format("\"%s\",\"%s\",\"%s\"", name, add, number); | |
System.out.println(output); | |
} | |
static String pdftoText(String fileName){ | |
PDFParser parser; | |
String parsedText = null;; | |
PDFTextStripper pdfStripper = null; | |
PDDocument pdDoc = null; | |
COSDocument cosDoc = null; | |
File file = new File(fileName); | |
if (!file.isFile()) { | |
System.err.println("File " + fileName + " does not exist."); | |
return null; | |
} | |
try { | |
parser = new PDFParser(new FileInputStream(file)); | |
} catch (IOException e) { | |
System.err.println("Unable to open PDF Parser. " + e.getMessage()); | |
return null; | |
} | |
try { | |
parser.parse(); | |
cosDoc = parser.getDocument(); | |
pdfStripper = new PDFTextStripper(); | |
pdDoc = new PDDocument(cosDoc); | |
pdfStripper.setStartPage(1); | |
parsedText = pdfStripper.getText(pdDoc); | |
} catch (Exception e) { | |
System.err | |
.println("An exception occured in parsing the PDF Document." | |
+ e.getMessage()); | |
} finally { | |
try { | |
if (cosDoc != null) | |
cosDoc.close(); | |
if (pdDoc != null) | |
pdDoc.close(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
return parsedText; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment