Heilum/Test.java

## pom.xml
<!--
	Maven build for the "readpdf" demo: pulls in PDFBox and Apache Tika for PDF
	text extraction, plus the ImageIO plugins needed to decode JBIG2 / JPEG 2000
	images embedded in PDFs.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<groupId>com.swordfish.readpdf</groupId>
	<artifactId>readpdf</artifactId>
	<version>0.0.1</version>

	<properties>
		<!-- The Java source contains non-ASCII comments; pin the encoding so the
			build does not depend on the platform default. -->
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<!-- The code uses java.nio.file (Java 7+); pin the language level instead
			of relying on the compiler plugin's version-dependent default. -->
		<maven.compiler.source>1.8</maven.compiler.source>
		<maven.compiler.target>1.8</maven.compiler.target>
	</properties>

	<dependencies>
		<!-- Direct PDF text extraction (PDDocument / PDFTextStripper). -->
		<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>pdfbox</artifactId>
			<version>2.0.10</version>
		</dependency>

		<!-- Tika parser implementations (PDF parser, Tesseract OCR parser, ...). -->
		<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
		<dependency>
			<groupId>org.apache.tika</groupId>
			<artifactId>tika-parsers</artifactId>
			<version>1.18</version>
		</dependency>

		<!-- Tika facade, config and SAX content handlers. Keep in step with tika-parsers. -->
		<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
		<dependency>
			<groupId>org.apache.tika</groupId>
			<artifactId>tika-core</artifactId>
			<version>1.18</version>
		</dependency>

		<!-- ImageIO plugins for JBIG2-compressed images inside PDFs. -->
		<!-- NOTE(review): the levigo plugin was donated to Apache PDFBox and lives on as
			org.apache.pdfbox:jbig2-imageio; both are kept here, confirm whether the
			older levigo artifact is still required. -->
		<dependency>
			<groupId>com.levigo.jbig2</groupId>
			<artifactId>levigo-jbig2-imageio</artifactId>
			<version>1.6.5</version>
		</dependency>
		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>jbig2-imageio</artifactId>
			<version>3.0.0</version>
		</dependency>

		<!-- SQLite JDBC driver. -->
		<dependency>
			<groupId>org.xerial</groupId>
			<artifactId>sqlite-jdbc</artifactId>
			<version>3.23.1</version>
		</dependency>

		<!-- ImageIO plugins for additional image formats, including JPEG 2000. -->
		<dependency>
			<groupId>com.github.jai-imageio</groupId>
			<artifactId>jai-imageio-core</artifactId>
			<version>1.4.0</version>
		</dependency>
		<dependency>
			<groupId>com.github.jai-imageio</groupId>
			<artifactId>jai-imageio-jpeg2000</artifactId>
			<version>1.3.0</version>
		</dependency>

	</dependencies>
</project>

## Test.java


import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;

/**
 * Demonstrates three ways of extracting text from a PDF file: the Tika facade,
 * Tika with Tesseract OCR for text inside embedded images, and plain PDFBox.
 *
 * <p>Setup required for the OCR path:
 * <ol>
 * <li>Install Tesseract: https://github.com/tesseract-ocr/tesseract/wiki</li>
 * <li>Download the trained data for the target language from
 * https://github.com/tesseract-ocr/tessdata and put it in the "tessdata" folder.</li>
 * <li>Background reading:
 * https://www.woodmark.de/blog/parsing-text-within-image-files-or-embedded-images-pdfs-using-apache-tika-ocr/</li>
 * </ol>
 *
 * <p>Every extraction method is best-effort: on failure it prints the stack
 * trace and returns an empty string instead of throwing.
 */
public class Main {

	/** PDF processed by {@link #main(String[])} when no path is given on the command line. */
	private static final String DEFAULT_PDF_PATH = "/work/projects/projects-2018/read_pdf2/vr.pdf";

	/** Directory containing the tesseract executable; override with {@code -Dtesseract.path=...}. */
	private static final String TESSERACT_PATH =
			System.getProperty("tesseract.path", "/usr/local/Cellar/tesseract/3.05.01/bin");

	/** Directory containing the *.traineddata files; override with {@code -Dtesseract.tessdata=...}. */
	private static final String TESSDATA_PATH =
			System.getProperty("tesseract.tessdata", "/usr/local/Cellar/tesseract/3.05.01/share/tessdata");

	/** Tesseract language code; override with {@code -Dtesseract.language=...}. */
	private static final String OCR_LANGUAGE = System.getProperty("tesseract.language", "chi_sim");

	/**
	 * Runs OCR-assisted extraction and prints the text to standard error.
	 *
	 * @param args optional; {@code args[0]} is the PDF path, otherwise the built-in default path is used
	 */
	public static void main(String[] args) {
		String pdfPath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
		System.err.println(getTextFromTesseract(pdfPath));
	}

	/**
	 * Extracts text with the Tika facade ({@code Tika.parseToString}).
	 *
	 * @param filePath path of the file to parse
	 * @return the extracted text, or an empty string if reading or parsing fails
	 */
	public static String getTextFromPdfByTika(String filePath) {
		try {
			return new Tika().parseToString(new File(filePath));
		} catch (IOException | TikaException e) {
			// Best-effort contract: report the failure and fall back to "".
			e.printStackTrace();
			return "";
		}
	}

	/**
	 * Extracts text with Tika, running Tesseract OCR on images embedded in the PDF.
	 *
	 * @param filePath path of the PDF to parse
	 * @return the extracted text, or an empty string if anything goes wrong
	 */
	public static String getTextFromTesseract(String filePath) {
		// try-with-resources: the original never closed this stream.
		try (InputStream pdf = Files.newInputStream(Paths.get(filePath))) {
			// Collect characters directly instead of round-tripping through bytes in the
			// platform default charset, which loses text the charset cannot represent.
			// Fully qualified so the file's import block does not need to change.
			java.io.StringWriter text = new java.io.StringWriter();
			BodyContentHandler handler = new BodyContentHandler(text);

			Parser parser = new AutoDetectParser(TikaConfig.getDefaultConfig());

			// Inline image extraction must be on for the OCR parser to see embedded images.
			PDFParserConfig pdfConfig = new PDFParserConfig();
			pdfConfig.setExtractInlineImages(true);

			// The <language>.traineddata file must be present in the tessdata directory.
			TesseractOCRConfig tesserConfig = new TesseractOCRConfig();
			tesserConfig.setLanguage(OCR_LANGUAGE);
			tesserConfig.setTesseractPath(TESSERACT_PATH);
			tesserConfig.setTessdataPath(TESSDATA_PATH);

			ParseContext parseContext = new ParseContext();
			// Registering the parser itself lets embedded documents (the images) be parsed recursively.
			parseContext.set(Parser.class, parser);
			parseContext.set(PDFParserConfig.class, pdfConfig);
			parseContext.set(TesseractOCRConfig.class, tesserConfig);

			parser.parse(pdf, handler, new Metadata(), parseContext);

			return text.toString();
		} catch (Exception e) {
			// Deliberately broad: this entry point never throws, whatever the parser does.
			e.printStackTrace();
			return "";
		}
	}

	/**
	 * Reads the text content of a PDF file with PDFBox.
	 *
	 * @param filePath path of the PDF to read
	 * @return the text read from the PDF, or an empty string if loading or extraction fails
	 */
	public static String getTextFromPdf(String filePath) {
		// try-with-resources: the original never closed the loaded document.
		try (PDDocument pdDoc = PDDocument.load(new File(filePath))) {
			return new PDFTextStripper().getText(pdDoc);
		} catch (IOException e) {
			// Also covers InvalidPasswordException, which is an IOException subclass.
			e.printStackTrace();
			return "";
		}
	}

}
	<!--
		Maven build for the "readpdf" demo: pulls in PDFBox and Apache Tika for PDF
		text extraction, plus the ImageIO plugins needed to decode JBIG2 / JPEG 2000
		images embedded in PDFs.
	-->
	<project xmlns="http://maven.apache.org/POM/4.0.0"
		xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
		xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
		<modelVersion>4.0.0</modelVersion>
		<groupId>com.swordfish.readpdf</groupId>
		<artifactId>readpdf</artifactId>
		<version>0.0.1</version>

		<properties>
			<!-- The Java source contains non-ASCII comments; pin the encoding so the
				build does not depend on the platform default. -->
			<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
			<!-- The code uses java.nio.file (Java 7+); pin the language level instead
				of relying on the compiler plugin's version-dependent default. -->
			<maven.compiler.source>1.8</maven.compiler.source>
			<maven.compiler.target>1.8</maven.compiler.target>
		</properties>

		<dependencies>
			<!-- Direct PDF text extraction (PDDocument / PDFTextStripper). -->
			<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
			<dependency>
				<groupId>org.apache.pdfbox</groupId>
				<artifactId>pdfbox</artifactId>
				<version>2.0.10</version>
			</dependency>

			<!-- Tika parser implementations (PDF parser, Tesseract OCR parser, ...). -->
			<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
			<dependency>
				<groupId>org.apache.tika</groupId>
				<artifactId>tika-parsers</artifactId>
				<version>1.18</version>
			</dependency>

			<!-- Tika facade, config and SAX content handlers. Keep in step with tika-parsers. -->
			<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
			<dependency>
				<groupId>org.apache.tika</groupId>
				<artifactId>tika-core</artifactId>
				<version>1.18</version>
			</dependency>

			<!-- ImageIO plugins for JBIG2-compressed images inside PDFs. -->
			<!-- NOTE(review): the levigo plugin was donated to Apache PDFBox and lives on as
				org.apache.pdfbox:jbig2-imageio; both are kept here, confirm whether the
				older levigo artifact is still required. -->
			<dependency>
				<groupId>com.levigo.jbig2</groupId>
				<artifactId>levigo-jbig2-imageio</artifactId>
				<version>1.6.5</version>
			</dependency>
			<dependency>
				<groupId>org.apache.pdfbox</groupId>
				<artifactId>jbig2-imageio</artifactId>
				<version>3.0.0</version>
			</dependency>

			<!-- SQLite JDBC driver. -->
			<dependency>
				<groupId>org.xerial</groupId>
				<artifactId>sqlite-jdbc</artifactId>
				<version>3.23.1</version>
			</dependency>

			<!-- ImageIO plugins for additional image formats, including JPEG 2000. -->
			<dependency>
				<groupId>com.github.jai-imageio</groupId>
				<artifactId>jai-imageio-core</artifactId>
				<version>1.4.0</version>
			</dependency>
			<dependency>
				<groupId>com.github.jai-imageio</groupId>
				<artifactId>jai-imageio-jpeg2000</artifactId>
				<version>1.3.0</version>
			</dependency>

		</dependencies>
	</project>


	import java.io.ByteArrayOutputStream;
	import java.io.File;
	import java.io.IOException;
	import java.io.InputStream;
	import java.nio.charset.Charset;
	import java.nio.file.Files;
	import java.nio.file.Paths;

	import org.apache.pdfbox.pdmodel.PDDocument;
	import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
	import org.apache.pdfbox.text.PDFTextStripper;
	import org.apache.tika.Tika;
	import org.apache.tika.config.TikaConfig;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.parser.AutoDetectParser;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.Parser;
	import org.apache.tika.parser.ocr.TesseractOCRConfig;
	import org.apache.tika.parser.pdf.PDFParserConfig;
	import org.apache.tika.sax.BodyContentHandler;

	/**
	 * Demonstrates three ways of extracting text from a PDF file: the Tika facade,
	 * Tika with Tesseract OCR for text inside embedded images, and plain PDFBox.
	 *
	 * <p>Setup required for the OCR path:
	 * <ol>
	 * <li>Install Tesseract: https://github.com/tesseract-ocr/tesseract/wiki</li>
	 * <li>Download the trained data for the target language from
	 * https://github.com/tesseract-ocr/tessdata and put it in the "tessdata" folder.</li>
	 * <li>Background reading:
	 * https://www.woodmark.de/blog/parsing-text-within-image-files-or-embedded-images-pdfs-using-apache-tika-ocr/</li>
	 * </ol>
	 *
	 * <p>Every extraction method is best-effort: on failure it prints the stack
	 * trace and returns an empty string instead of throwing.
	 */
	public class Main {

		/** PDF processed by {@link #main(String[])} when no path is given on the command line. */
		private static final String DEFAULT_PDF_PATH = "/work/projects/projects-2018/read_pdf2/vr.pdf";

		/** Directory containing the tesseract executable; override with {@code -Dtesseract.path=...}. */
		private static final String TESSERACT_PATH =
				System.getProperty("tesseract.path", "/usr/local/Cellar/tesseract/3.05.01/bin");

		/** Directory containing the *.traineddata files; override with {@code -Dtesseract.tessdata=...}. */
		private static final String TESSDATA_PATH =
				System.getProperty("tesseract.tessdata", "/usr/local/Cellar/tesseract/3.05.01/share/tessdata");

		/** Tesseract language code; override with {@code -Dtesseract.language=...}. */
		private static final String OCR_LANGUAGE = System.getProperty("tesseract.language", "chi_sim");

		/**
		 * Runs OCR-assisted extraction and prints the text to standard error.
		 *
		 * @param args optional; {@code args[0]} is the PDF path, otherwise the built-in default path is used
		 */
		public static void main(String[] args) {
			String pdfPath = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
			System.err.println(getTextFromTesseract(pdfPath));
		}

		/**
		 * Extracts text with the Tika facade ({@code Tika.parseToString}).
		 *
		 * @param filePath path of the file to parse
		 * @return the extracted text, or an empty string if reading or parsing fails
		 */
		public static String getTextFromPdfByTika(String filePath) {
			try {
				return new Tika().parseToString(new File(filePath));
			} catch (IOException | TikaException e) {
				// Best-effort contract: report the failure and fall back to "".
				e.printStackTrace();
				return "";
			}
		}

		/**
		 * Extracts text with Tika, running Tesseract OCR on images embedded in the PDF.
		 *
		 * @param filePath path of the PDF to parse
		 * @return the extracted text, or an empty string if anything goes wrong
		 */
		public static String getTextFromTesseract(String filePath) {
			// try-with-resources: the original never closed this stream.
			try (InputStream pdf = Files.newInputStream(Paths.get(filePath))) {
				// Collect characters directly instead of round-tripping through bytes in the
				// platform default charset, which loses text the charset cannot represent.
				// Fully qualified so the file's import block does not need to change.
				java.io.StringWriter text = new java.io.StringWriter();
				BodyContentHandler handler = new BodyContentHandler(text);

				Parser parser = new AutoDetectParser(TikaConfig.getDefaultConfig());

				// Inline image extraction must be on for the OCR parser to see embedded images.
				PDFParserConfig pdfConfig = new PDFParserConfig();
				pdfConfig.setExtractInlineImages(true);

				// The <language>.traineddata file must be present in the tessdata directory.
				TesseractOCRConfig tesserConfig = new TesseractOCRConfig();
				tesserConfig.setLanguage(OCR_LANGUAGE);
				tesserConfig.setTesseractPath(TESSERACT_PATH);
				tesserConfig.setTessdataPath(TESSDATA_PATH);

				ParseContext parseContext = new ParseContext();
				// Registering the parser itself lets embedded documents (the images) be parsed recursively.
				parseContext.set(Parser.class, parser);
				parseContext.set(PDFParserConfig.class, pdfConfig);
				parseContext.set(TesseractOCRConfig.class, tesserConfig);

				parser.parse(pdf, handler, new Metadata(), parseContext);

				return text.toString();
			} catch (Exception e) {
				// Deliberately broad: this entry point never throws, whatever the parser does.
				e.printStackTrace();
				return "";
			}
		}

		/**
		 * Reads the text content of a PDF file with PDFBox.
		 *
		 * @param filePath path of the PDF to read
		 * @return the text read from the PDF, or an empty string if loading or extraction fails
		 */
		public static String getTextFromPdf(String filePath) {
			// try-with-resources: the original never closed the loaded document.
			try (PDDocument pdDoc = PDDocument.load(new File(filePath))) {
				return new PDFTextStripper().getText(pdDoc);
			} catch (IOException e) {
				// Also covers InvalidPasswordException, which is an IOException subclass.
				e.printStackTrace();
				return "";
			}
		}

	}