rishuatgithub/SimplePDFTableExtraction

## SimplePDFTableExtraction
package com.rks.git.pdf.parser.build;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDDocument;
import technology.tabula.ObjectExtractor;
import technology.tabula.Page;
import technology.tabula.RectangularTextContainer;
import technology.tabula.Table;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

/**
 * Command-line entry point that prints the tables Tabula finds on the first
 * page of a PDF document.
 *
 * <p>Each table row is written to standard output on its own line, with every
 * cell's text followed by a {@code |} separator.
 */
public class PDFParserMaster {

	/** PDF that is parsed when no path is supplied on the command line. */
	private static final String DEFAULT_FILENAME = "C:\\Users\\Rishu\\Downloads\\PDF Parser Data Test.pdf";

	/**
	 * Loads the PDF, reports its page count and dumps the tables of page 1.
	 *
	 * @param args optional; when present, {@code args[0]} is the path of the PDF
	 *             to parse, otherwise {@link #DEFAULT_FILENAME} is used
	 * @throws IOException if the PDF cannot be opened, read or closed
	 */
	public static void main(String[] args) throws IOException {
		final String filename = (args != null && args.length > 0) ? args[0] : DEFAULT_FILENAME;

		// try-with-resources guarantees the document (and its file handle) is
		// released even when extraction fails part-way through.
		try (PDDocument document = PDDocument.load(new File(filename))) {

			int totalPages = document.getNumberOfPages();
			System.out.println("Total Pages in Document: "+totalPages);

			// Nothing to extract from an empty document; asking for page 1 would fail.
			if (totalPages < 1) {
				return;
			}

			ObjectExtractor extractor = new ObjectExtractor(document);
			SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();

			// Page numbers passed to the extractor start at 1; only the first page is processed.
			Page page = extractor.extract(1);

			// extract text from the table after detecting
			List<Table> tables = algorithm.extract(page);
			for (Table table : tables) {
				List<List<RectangularTextContainer>> rows = table.getRows();

				for (List<RectangularTextContainer> cells : rows) {
					for (RectangularTextContainer cell : cells) {
						System.out.print(cell.getText()+"|");
					}

					// one output line per table row
					System.out.println();
				}
			}
		}
	}

}
	package com.rks.git.pdf.parser.build;

	import java.io.File;
	import java.io.IOException;
	import java.util.List;

	import org.apache.pdfbox.pdmodel.PDDocument;
	import technology.tabula.ObjectExtractor;
	import technology.tabula.Page;
	import technology.tabula.RectangularTextContainer;
	import technology.tabula.Table;
	import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

	public class PDFParserMaster {

	public static void main(String[] args) throws IOException {
	// TODO Auto-generated method stub
	final String FILENAME="C:\\Users\\Rishu\\Downloads\\PDF Parser Data Test.pdf";

	PDDocument pd = PDDocument.load(new File(FILENAME));

	int totalPages = pd.getNumberOfPages();
	System.out.println("Total Pages in Document: "+totalPages);

	ObjectExtractor oe = new ObjectExtractor(pd);
	SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
	Page page = oe.extract(1);

	// extract text from the table after detecting
	List<Table> table = sea.extract(page);
	for(Table tables: table) {
	List<List<RectangularTextContainer>> rows = tables.getRows();

	for(int i=0; i<rows.size(); i++) {

	List<RectangularTextContainer> cells = rows.get(i);

	for(int j=0; j<cells.size(); j++) {
	System.out.print(cells.get(j).getText()+"\|");
	}

	System.out.println();
	}
	}

	}

	}