Skip to content

Instantly share code, notes, and snippets.

@Heilum
Last active June 2, 2023 09:51
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Heilum/af7dcc1fa26762ea459648e4d6a68fd1 to your computer and use it in GitHub Desktop.
Save Heilum/af7dcc1fa26762ea459648e4d6a68fd1 to your computer and use it in GitHub Desktop.
Apache Tika + Tesseract OCR: extracting Chinese text from scanned PDF files
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.swordfish.readpdf</groupId>
  <artifactId>readpdf</artifactId>
  <version>0.0.1</version>

  <properties>
    <!-- Pin the source encoding so the build does not depend on the platform default. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Pin the language level instead of relying on the compiler plugin's default.
         NOTE(review): Java 8 is assumed as the baseline; confirm against the JDK in use. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- PDF loading and plain text stripping. -->
    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>2.0.10</version>
    </dependency>

    <!-- Tika parsers (PDF parser, Tesseract OCR parser) and the Tika core API.
         Keep both Tika artifacts on the same version. -->
    <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-parsers</artifactId>
      <version>1.18</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-core</artifactId>
      <version>1.18</version>
    </dependency>

    <!-- ImageIO plugins for image formats embedded in PDFs (JBIG2, JPEG 2000).
         NOTE(review): the two JBIG2 artifacts below both appear to provide a JBIG2 ImageIO
         plugin; confirm whether both are really needed. -->
    <dependency>
      <groupId>com.levigo.jbig2</groupId>
      <artifactId>levigo-jbig2-imageio</artifactId>
      <version>1.6.5</version>
    </dependency>
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>jbig2-imageio</artifactId>
      <version>3.0.0</version>
    </dependency>

    <dependency>
      <groupId>org.xerial</groupId>
      <artifactId>sqlite-jdbc</artifactId>
      <version>3.23.1</version>
    </dependency>

    <dependency>
      <groupId>com.github.jai-imageio</groupId>
      <artifactId>jai-imageio-core</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>com.github.jai-imageio</groupId>
      <artifactId>jai-imageio-jpeg2000</artifactId>
      <version>1.3.0</version>
    </dependency>
  </dependencies>
</project>
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
/**
 * Extracts text from PDF files with three alternative strategies: plain PDFBox text stripping,
 * the Tika facade, and Tika combined with Tesseract OCR for text that only exists inside images.
 *
 * <p>Setup for the OCR strategy:
 * <ol>
 *   <li>Install Tesseract: https://github.com/tesseract-ocr/tesseract/wiki</li>
 *   <li>Download the trained data for the target language from
 *       https://github.com/tesseract-ocr/tessdata and put it in the "tessdata" folder.</li>
 * </ol>
 * Reference:
 * https://www.woodmark.de/blog/parsing-text-within-image-files-or-embedded-images-pdfs-using-apache-tika-ocr/
 *
 * <p>All extraction methods are best-effort: on failure they print the stack trace and return an
 * empty string instead of throwing.
 */
public class Main {

    /** PDF processed by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH = "/work/projects/projects-2018/read_pdf2/vr.pdf";

    /** Tesseract language code used by default (simplified Chinese). */
    private static final String DEFAULT_OCR_LANGUAGE = "chi_sim";

    /** Default directory containing the Tesseract executable (Homebrew install on macOS). */
    private static final String DEFAULT_TESSERACT_PATH = "/usr/local/Cellar/tesseract/3.05.01/bin";

    /** Default tessdata directory; {@code chi_sim.traineddata} must be placed in it. */
    private static final String DEFAULT_TESSDATA_PATH =
            "/usr/local/Cellar/tesseract/3.05.01/share/tessdata";

    /**
     * Runs OCR-based extraction on one PDF and prints the result to standard error.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to process. Without arguments
     *             the built-in default path is used.
     */
    public static void main(String[] args) {
        String pdfPath = args.length > 0 ? args[0] : DEFAULT_PDF_PATH;
        System.err.println(getTextFromTesseract(pdfPath));
    }

    /**
     * Extracts the text of a file through the {@link Tika} facade.
     *
     * @param filePath path of the file to parse
     * @return the extracted text, or an empty string if reading or parsing fails
     */
    public static String getTextFromPdfByTika(String filePath) {
        try {
            Tika tika = new Tika();
            // Lift the facade's default output cap so long documents are not silently truncated.
            tika.setMaxStringLength(-1);
            return tika.parseToString(new File(filePath));
        } catch (IOException | TikaException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Extracts text from a PDF, running Tesseract OCR on images embedded in it, using the default
     * language (simplified Chinese) and the default Tesseract installation paths.
     *
     * @param filePath path of the PDF to parse
     * @return the extracted text, or an empty string if anything fails
     */
    public static String getTextFromTesseract(String filePath) {
        return getTextFromTesseract(
                filePath, DEFAULT_OCR_LANGUAGE, DEFAULT_TESSERACT_PATH, DEFAULT_TESSDATA_PATH);
    }

    /**
     * Extracts text from a PDF, running Tesseract OCR on images embedded in it.
     *
     * @param filePath      path of the PDF to parse
     * @param language      Tesseract language code, e.g. {@code "chi_sim"}; the matching
     *                      {@code .traineddata} file must exist in {@code tessdataPath}
     * @param tesseractPath directory containing the Tesseract executable
     * @param tessdataPath  directory containing the Tesseract trained data files
     * @return the extracted text, or an empty string if anything fails
     */
    public static String getTextFromTesseract(
            String filePath, String language, String tesseractPath, String tessdataPath) {
        // try-with-resources guarantees the input stream is closed even when parsing fails.
        try (InputStream pdf = Files.newInputStream(Paths.get(filePath))) {
            Parser parser = new AutoDetectParser(TikaConfig.getDefaultConfig());

            // Inline images must be extracted, otherwise the OCR parser never sees them.
            PDFParserConfig pdfConfig = new PDFParserConfig();
            pdfConfig.setExtractInlineImages(true);

            TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
            ocrConfig.setLanguage(language);
            ocrConfig.setTesseractPath(tesseractPath);
            ocrConfig.setTessdataPath(tessdataPath);

            ParseContext context = new ParseContext();
            // Registering the parser in the context lets embedded documents (the images) be
            // parsed recursively.
            context.set(Parser.class, parser);
            context.set(PDFParserConfig.class, pdfConfig);
            context.set(TesseractOCRConfig.class, ocrConfig);

            // Collect the text in memory as characters (-1 = no write limit). Going through a
            // byte stream and the platform default charset can corrupt non-Latin text.
            BodyContentHandler handler = new BodyContentHandler(-1);
            parser.parse(pdf, handler, new Metadata(), context);
            return handler.toString();
        } catch (Exception e) {
            // Deliberately broad: this is a best-effort boundary that must not throw.
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Reads the text content of a PDF with PDFBox (no OCR).
     *
     * @param filePath path of the PDF to read
     * @return the text read from the PDF, or an empty string if the file cannot be loaded or read
     */
    public static String getTextFromPdf(String filePath) {
        // try-with-resources closes the document and releases the resources it holds.
        try (PDDocument document = PDDocument.load(new File(filePath))) {
            return new PDFTextStripper().getText(document);
        } catch (IOException e) {
            // Also covers InvalidPasswordException, which is an IOException subclass.
            e.printStackTrace();
            return "";
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment