Skip to content

Instantly share code, notes, and snippets.

@nhojpatrick
Created January 18, 2016 16:15
Show Gist options
  • Save nhojpatrick/ac01a3b3d791364b26f8 to your computer and use it in GitHub Desktop.
Save nhojpatrick/ac01a3b3d791364b26f8 to your computer and use it in GitHub Desktop.
TikaDetectTester for tika user mailing list
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
 * Command-line harness that runs Apache Tika media type detection on a file
 * and dumps the content of the underlying stream before and after detection.
 *
 * <p>Given an input path {@code P}, it writes:
 * <ul>
 *   <li>{@code P.out1} - everything read from the buffered file stream
 *       before detection;</li>
 *   <li>{@code P.out2} - everything that can still be read from that same
 *       buffered stream after the detector has run on a
 *       {@link TikaInputStream} wrapping it.</li>
 * </ul>
 * Comparing the two files shows what detection left on the underlying stream.
 */
public class TikaDetectTester {

    /**
     * Chunk size, in bytes, used when copying streams. A fixed, non-zero size
     * is used on purpose: sizing the buffer from {@code available()} yields a
     * zero-length array for empty files, and reading into a zero-length array
     * returns 0 forever instead of -1.
     */
    private static final int COPY_BUFFER_SIZE = 8192;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the path of the file to inspect
     * @throws IllegalArgumentException if no input path is supplied
     * @throws Exception if reading, writing or detection fails
     */
    public static void main(final String[] args) throws Exception {
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException("Usage: TikaDetectTester <input-file>");
        }
        final String inputPath = args[0];

        try (final FileInputStream fis = new FileInputStream(inputPath);
                final BufferedInputStream bis = new BufferedInputStream(fis)) {

            // Very large read limit so the mark stays valid after the whole
            // file has been read once and reset() can rewind to the start.
            bis.mark(Integer.MAX_VALUE - 8);

            // First dump: the stream content before Tika touches it.
            try (final FileOutputStream before = new FileOutputStream(inputPath + ".out1")) {
                copy(bis, before);
            }
            bis.reset();

            // NOTE(review): closing the TikaInputStream is expected to also
            // close the wrapped stream; the extra close by the outer
            // try-with-resources is then a harmless no-op.
            try (final TikaInputStream tis = TikaInputStream.get(bis)) {
                final TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
                final Detector tikaDetector = tikaConfig.getDetector();
                final Metadata metadata = new Metadata();
                final MediaType detected = tikaDetector.detect(tis, metadata);
                System.out.println("Tika Detected Media Type = " + String.valueOf(detected) + ";");

                // Second dump: deliberately read from the underlying buffered
                // stream (not from tis) to capture whatever is still available
                // on it after detection.
                try (final FileOutputStream after = new FileOutputStream(inputPath + ".out2")) {
                    copy(bis, after);
                }
            }
        }
    }

    /**
     * Copies all remaining bytes from {@code in} to {@code out}.
     *
     * @param in  source stream, read until end of stream
     * @param out destination stream
     * @throws IOException if reading or writing fails
     */
    private static void copy(final InputStream in, final OutputStream out) throws IOException {
        final byte[] buffer = new byte[COPY_BUFFER_SIZE];
        int noOfBytes;
        while ((noOfBytes = in.read(buffer)) != -1) {
            out.write(buffer, 0, noOfBytes);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment