Skip to content

Instantly share code, notes, and snippets.

@kinjouj
Created April 27, 2012 09:16
Show Gist options
  • Save kinjouj/2507727 to your computer and use it in GitHub Desktop.
Save kinjouj/2507727 to your computer and use it in GitHub Desktop.
PDF Extract Text Using Apache Tika
<?xml version="1.0" ?>
<!--
  Maven build for the Tika text-extraction sample.
  Compiles the sources as Java 1.6 and lets "mvn exec:java" launch the Sample class.
-->
<project
    xmlns="http://maven.apache.org/POM/4.0.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>net.kinjouj.tika</groupId>
  <artifactId>kinjouj_tika</artifactId>
  <version>1.0</version>
  <name>kinjouj_tika</name>

  <build>
    <plugins>
      <!-- Fix the language level and source encoding so the build does not depend on platform defaults. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
      <!-- Main class used by the exec plugin; Sample lives in the default package. -->
      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>exec-maven-plugin</artifactId>
        <version>1.2.1</version>
        <configuration>
          <mainClass>Sample</mainClass>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <!-- Core Tika API: Parser, Metadata, ParseContext and the SAX content handlers. -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-core</artifactId>
      <version>1.1</version>
    </dependency>
    <!-- Format-specific parser implementations that AutoDetectParser delegates to. -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-parsers</artifactId>
      <version>1.1</version>
    </dependency>
  </dependencies>
</project>
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Command-line sample that runs a document through Apache Tika and prints what it finds.
 *
 * <p>The body content is handed to a {@link BodyContentHandler} bound to {@code System.out};
 * once parsing has finished, every metadata entry with a non-null value is printed as a
 * name/value pair.
 */
public class Sample {

    /** Document parsed when no path is supplied on the command line. */
    private static final String DEFAULT_DOCUMENT = "sample.pdf";

    /**
     * Parses one document and prints its body content followed by its metadata.
     *
     * <p>I/O, Tika and SAX failures are reported as a stack trace on standard error; the
     * method itself does not throw them.
     *
     * @param args optional; {@code args[0]} is the path of the document to parse. When no
     *             argument is given, {@code sample.pdf} in the working directory is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_DOCUMENT;

        // Explicit try/finally rather than try-with-resources: the build targets Java 1.6.
        InputStream is = null;
        try {
            is = new BufferedInputStream(new FileInputStream(new File(path)));

            Parser parser = new AutoDetectParser();
            ContentHandler handler = new BodyContentHandler(System.out);
            Metadata metadata = new Metadata();

            // The same Metadata instance is read back below after the parser has run.
            parser.parse(is, handler, metadata, new ParseContext());

            printMetadata(metadata);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (TikaException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } finally {
            close(is);
        }
    }

    /** Prints each metadata name with its value, skipping entries whose value is null. */
    private static void printMetadata(Metadata metadata) {
        for (String name : metadata.names()) {
            String value = metadata.get(name);
            if (value == null) {
                continue;
            }
            System.out.println("Metadata Name: " + name);
            System.out.println("Metadata Value: " + value);
        }
    }

    /** Closes the stream if it was opened, reporting (not propagating) a failure to close. */
    private static void close(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment