Skip to content

Instantly share code, notes, and snippets.

@zhouhoo
Created January 6, 2017 07:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save zhouhoo/51f1ff85d0bacb7446fb84a6bb516129 to your computer and use it in GitHub Desktop.
Save zhouhoo/51f1ff85d0bacb7446fb84a6bb516129 to your computer and use it in GitHub Desktop.
Use Tika to convert PDF files to text.
// Converting PDF to text is a hard job, but the Tika tool handles it well: it can auto-detect the document format and choose a suitable parser for it.
package june;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Batch-converts the files of a fixed input directory to plain text using Apache Tika.
 *
 * <p>Every regular file found in {@link #DIR_PATH_IN} is parsed with an
 * {@link AutoDetectParser} and its body text is written to {@code <index>.txt} inside
 * {@link #DIR_PATH_OUT}, where {@code <index>} is the file's position in the array
 * returned by {@link File#list()}.
 */
public class TikeParser {

    /** Directory that is scanned for input documents. */
    private static final String DIR_PATH_IN = "C:/Users/Administrator/Desktop/finance_year/";

    /** Directory that receives the extracted text files. */
    private static final String DIR_PATH_OUT = "E:/notice/finance_year_txt_s/";

    /**
     * Converts every regular file in the input directory, one output file per input.
     *
     * <p>A failure on one document is reported on {@code System.err} and the batch
     * continues with the next file instead of aborting.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File dir = new File(DIR_PATH_IN);
        String[] filenames = dir.list();
        // File.list() returns null when the path is not a directory or cannot be read.
        if (filenames == null) {
            System.err.println("Cannot list input directory: " + DIR_PATH_IN);
            return;
        }

        // The parser is created once and reused for every document.
        Parser parser = new AutoDetectParser();

        for (int i = 0; i < filenames.length; i++) {
            System.out.println(i + " : " + filenames[i]);
            File input = new File(dir, filenames[i]);
            // Sub-directories (and other non-regular entries) cannot be opened as a stream.
            if (!input.isFile()) {
                continue;
            }
            // The index stays tied to the listing position so output names are unchanged.
            File output = new File(DIR_PATH_OUT + i + ".txt");
            try {
                convert(parser, input, output);
            } catch (IOException | TikaException | SAXException e) {
                System.err.println("Failed to convert " + input + " -> " + output);
                e.printStackTrace();
            }
        }
    }

    /**
     * Parses a single document and writes its body text to {@code output}.
     *
     * <p>Both streams are closed when this method returns, whether parsing succeeded or not.
     *
     * @param parser the Tika parser used to read the document
     * @param input  the document to read
     * @param output the text file to create (an existing file is overwritten)
     * @throws IOException   if either file cannot be opened, read or written
     * @throws TikaException if Tika fails to parse the document
     * @throws SAXException  if the content handler reports an error
     */
    private static void convert(Parser parser, File input, File output)
            throws IOException, TikaException, SAXException {
        try (InputStream is = new BufferedInputStream(new FileInputStream(input));
             OutputStream out = new FileOutputStream(output)) {
            ContentHandler handler = new BodyContentHandler(out);
            Metadata metadata = new Metadata();
            parser.parse(is, handler, metadata, new ParseContext());
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment