Skip to content

Instantly share code, notes, and snippets.

@bmanojkumar
Created August 7, 2014 18:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bmanojkumar/d37659c4eae8f2ee9f52 to your computer and use it in GitHub Desktop.
Save bmanojkumar/d37659c4eae8f2ee9f52 to your computer and use it in GitHub Desktop.
import java.io.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.util.*;
/**
 * Small command-line utility built on PDFBox.
 *
 * <p>It loads {@code final.pdf} from the working directory, prints the page
 * count and the encryption flag, saves the document as
 * {@code CopyOfInvoice.pdf}, extracts the text, prints the number of extracted
 * lines, and finally prints every integer found on the line that immediately
 * follows a line consisting solely of the heading "Education" (compared
 * case-insensitively, ignoring surrounding whitespace).
 */
public class pdfextract {

    /** Matches a run of decimal digits with an optional leading minus sign. */
    private static final Pattern INTEGER = Pattern.compile("-?\\d+");

    /**
     * Program entry point.
     *
     * <p>Any exception raised while loading, saving or parsing the document is
     * reported via its stack trace; the document is closed in every case.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        PDDocument pd = null;
        try {
            File input = new File("final.pdf"); // The PDF file from which text is extracted
            pd = PDDocument.load(input);
            System.out.println(pd.getNumberOfPages());
            System.out.println(pd.isEncrypted());
            pd.save("CopyOfInvoice.pdf"); // Creates a copy called "CopyOfInvoice.pdf"

            PDFTextStripper stripper = new PDFTextStripper();
            String text = stripper.getText(pd);
            String[] lines = text.split("\n");
            System.out.println(lines.length);

            for (int i = 0; i < lines.length; i++) {
                boolean isHeading = lines[i].trim().equalsIgnoreCase("Education");
                // Guard i + 1: the heading may be the very last extracted line,
                // in which case there is no following line to inspect.
                if (isHeading && i + 1 < lines.length) {
                    printIntegers(lines[i + 1]);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the document, including when extraction failed.
            if (pd != null) {
                try {
                    pd.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Prints each integer found in {@code line} on its own line of standard
     * output, in order of appearance.
     *
     * <p>The pattern can never match across whitespace, so scanning the whole
     * line yields the same matches as scanning each space-separated token.
     *
     * @param line the text to scan
     */
    private static void printIntegers(String line) {
        Matcher m = INTEGER.matcher(line);
        while (m.find()) {
            System.out.println(m.group());
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment