Created
July 6, 2019 12:53
-
-
Save rishuatgithub/f864fd8ea7be992bb985da4372094370 to your computer and use it in GitHub Desktop.
Extract Tables From PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.rks.git.pdf.parser.build; | |
import java.io.File; | |
import java.io.IOException; | |
import java.util.List; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import technology.tabula.ObjectExtractor; | |
import technology.tabula.Page; | |
import technology.tabula.RectangularTextContainer; | |
import technology.tabula.Table; | |
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; | |
public class PDFParserMaster { | |
public static void main(String[] args) throws IOException { | |
// TODO Auto-generated method stub | |
final String FILENAME="C:\\Users\\Rishu\\Downloads\\PDF Parser Data Test.pdf"; | |
PDDocument pd = PDDocument.load(new File(FILENAME)); | |
int totalPages = pd.getNumberOfPages(); | |
System.out.println("Total Pages in Document: "+totalPages); | |
ObjectExtractor oe = new ObjectExtractor(pd); | |
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); | |
Page page = oe.extract(1); | |
// extract text from the table after detecting | |
List<Table> table = sea.extract(page); | |
for(Table tables: table) { | |
List<List<RectangularTextContainer>> rows = tables.getRows(); | |
for(int i=0; i<rows.size(); i++) { | |
List<RectangularTextContainer> cells = rows.get(i); | |
for(int j=0; j<cells.size(); j++) { | |
System.out.print(cells.get(j).getText()+"|"); | |
} | |
System.out.println(); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
thank you very much rish i was maddly looking for the algorithm of table identified. you know i have tried on tabula-py and build this api for table data extraction because python having rich library . now we have a solution in java