bmanojkumar/pdfextract.java

## pdfextract.java
import java.io.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.util.*;

/**
 * Command-line utility that loads {@code final.pdf}, prints its page count and
 * encryption flag, saves a copy as {@code CopyOfInvoice.pdf}, and then prints
 * every integer found on the line that follows an "Education" heading in the
 * text returned by {@link PDFTextStripper#getText}.
 */
public class pdfextract {

    /** PDF file the text is extracted from. */
    private static final String INPUT_PDF = "final.pdf";

    /** Path the loaded document is re-saved to. */
    private static final String COPY_PDF = "CopyOfInvoice.pdf";

    /** Heading whose following line is scanned; compared trimmed and case-insensitively. */
    private static final String SECTION_HEADING = "Education";

    /** Optionally signed run of digits; compiled once instead of on every run of the loop setup. */
    private static final Pattern INTEGER = Pattern.compile("-?\\d+");

    /**
     * Program entry point.
     *
     * <p>Any failure while loading, saving or extracting is reported with a
     * stack trace on standard error; the document is closed in all cases.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        PDDocument pd = null;
        try {
            pd = PDDocument.load(new File(INPUT_PDF));
            System.out.println(pd.getNumberOfPages());
            System.out.println(pd.isEncrypted());
            pd.save(COPY_PDF);

            String text = new PDFTextStripper().getText(pd);
            String[] lines = text.split("\n");
            System.out.println(lines.length);

            printIntegersAfterHeading(lines);
        } catch (Exception e) {
            // Top-level boundary of a command-line tool: report and exit normally.
            e.printStackTrace();
        } finally {
            closeDocument(pd);
        }
    }

    /**
     * Prints, one per output line, each integer that appears on the line
     * immediately following every occurrence of the section heading.
     *
     * @param lines extracted text split into lines
     */
    private static void printIntegersAfterHeading(String[] lines) {
        // Stop one short of the end: a heading on the very last line has no
        // following line to scan (previously an ArrayIndexOutOfBoundsException).
        for (int i = 0; i + 1 < lines.length; i++) {
            if (!lines[i].trim().equalsIgnoreCase(SECTION_HEADING)) {
                continue;
            }
            // A match can never contain a space, so scanning the whole line
            // yields the same matches, in the same order, as scanning each
            // space-separated token in turn.
            Matcher m = INTEGER.matcher(lines[i + 1]);
            while (m.find()) {
                System.out.println(m.group());
            }
        }
    }

    /**
     * Closes the document if it was opened, reporting (not propagating) a
     * failure to close.
     *
     * @param pd the document to close, or {@code null} if loading failed
     */
    private static void closeDocument(PDDocument pd) {
        if (pd == null) {
            return;
        }
        try {
            pd.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
	import java.io.*;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import org.apache.pdfbox.pdmodel.*;
	import org.apache.pdfbox.util.*;

	/**
	 * Command-line utility that loads {@code final.pdf}, prints its page count and
	 * encryption flag, saves a copy as {@code CopyOfInvoice.pdf}, and then prints
	 * every integer found on the line that follows an "Education" heading in the
	 * text returned by {@link PDFTextStripper#getText}.
	 */
	public class pdfextract {

		/** PDF file the text is extracted from. */
		private static final String INPUT_PDF = "final.pdf";

		/** Path the loaded document is re-saved to. */
		private static final String COPY_PDF = "CopyOfInvoice.pdf";

		/** Heading whose following line is scanned; compared trimmed and case-insensitively. */
		private static final String SECTION_HEADING = "Education";

		/** Optionally signed run of digits; compiled once as a constant. */
		private static final Pattern INTEGER = Pattern.compile("-?\\d+");

		/**
		 * Program entry point.
		 *
		 * <p>Any failure while loading, saving or extracting is reported with a
		 * stack trace on standard error; the document is closed in all cases.
		 *
		 * @param args ignored
		 */
		public static void main(String[] args) {
			PDDocument pd = null;
			try {
				pd = PDDocument.load(new File(INPUT_PDF));
				System.out.println(pd.getNumberOfPages());
				System.out.println(pd.isEncrypted());
				pd.save(COPY_PDF);

				String text = new PDFTextStripper().getText(pd);
				String[] lines = text.split("\n");
				System.out.println(lines.length);

				printIntegersAfterHeading(lines);
			} catch (Exception e) {
				// Top-level boundary of a command-line tool: report and exit normally.
				e.printStackTrace();
			} finally {
				closeDocument(pd);
			}
		}

		/**
		 * Prints, one per output line, each integer that appears on the line
		 * immediately following every occurrence of the section heading.
		 *
		 * @param lines extracted text split into lines
		 */
		private static void printIntegersAfterHeading(String[] lines) {
			// Stop one short of the end: a heading on the very last line has no
			// following line to scan (previously an ArrayIndexOutOfBoundsException).
			for (int i = 0; i + 1 < lines.length; i++) {
				if (!lines[i].trim().equalsIgnoreCase(SECTION_HEADING)) {
					continue;
				}
				// A match can never contain a space, so scanning the whole line
				// yields the same matches, in the same order, as scanning each
				// space-separated token in turn.
				Matcher m = INTEGER.matcher(lines[i + 1]);
				while (m.find()) {
					System.out.println(m.group());
				}
			}
		}

		/**
		 * Closes the document if it was opened, reporting (not propagating) a
		 * failure to close.
		 *
		 * @param pd the document to close, or {@code null} if loading failed
		 */
		private static void closeDocument(PDDocument pd) {
			if (pd == null) {
				return;
			}
			try {
				pd.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}