Skip to content

Instantly share code, notes, and snippets.

@fxfixer
Last active April 13, 2016 18:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fxfixer/e54f86095a548cbfb8aeb948ff77a41b to your computer and use it in GitHub Desktop.
Save fxfixer/e54f86095a548cbfb8aeb948ff77a41b to your computer and use it in GitHub Desktop.
Tika DBF file detector
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.dbf;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.io.EndianUtils.BufferUnderrunException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import com.google.common.math.LongMath;
/**
 * Detects DBF files by sanity-checking the leading bytes of the file header.
 *
 * <p>Only files whose signature byte is {@code 3} (DBase III) are accepted. The
 * detector validates the last-update date, the record count, the header and
 * record lengths, and two flag bytes. When the total stream length is known it
 * is compared against the size computed from the header; otherwise the first
 * field descriptor is inspected instead.
 *
 * <p>The stream is marked before the check and reset afterwards, so the caller
 * should pass a stream that supports {@code mark}/{@code reset}.
 *
 * @author Nick C
 */
public class DBFDetector implements Detector {
    private static final long serialVersionUID = 1L;

    /** Media type reported for a stream that passes the header checks. */
    private static final MediaType DBF_TYPE = MediaType.application("x-dbf");

    /** Upper bound used for both the record count and the computed file size. */
    private static final long MAX_FILE_SIZE = 0xFFFFFFFFL;

    /** Size in bytes of the file header and of each field descriptor. */
    private static final byte FIELD_HEADER_SIZE = 32;

    // 32 for header + 32 for first field header + 1 for terminator
    private static final int MIN_HEADER_LENGTH = FIELD_HEADER_SIZE * 2 + 1;

    /** Largest accepted day for each month (index 0 = January); February allows 29. */
    private static final byte[] MAX_DAY_OF_MONTH =
            new byte[] {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};

    /**
     * Returns the DBF media type when the stream starts with a plausible DBF
     * header, and {@link MediaType#OCTET_STREAM} otherwise (including when
     * {@code input} is {@code null}).
     *
     * @param input the document stream, may be {@code null}
     * @param metadata document metadata, consulted for the content length
     * @return the DBF media type or {@link MediaType#OCTET_STREAM}
     * @throws IOException if the stream cannot be read or reset
     */
    @Override
    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
        if (input != null && hasValidDBFHeader(input, metadata)) {
            return DBF_TYPE;
        } else {
            return MediaType.OCTET_STREAM;
        }
    }

    /**
     * Checks whether the stream begins with a header that looks like a DBase III
     * file. The stream is reset to its starting position before returning.
     *
     * @param input the stream to inspect; {@code null} yields {@code false}
     * @param metadata metadata used to look up the content length; may be
     *        {@code null}, in which case only the stream itself is consulted
     * @return {@code true} if every header check passes
     * @throws IOException if reading fails for a reason other than a premature
     *         end of stream, or if the stream cannot be reset
     */
    public static boolean hasValidDBFHeader(InputStream input, Metadata metadata)
            throws IOException {
        if (input == null) {
            // Nothing to inspect; report "not a DBF" rather than failing with an NPE
            return false;
        }
        // The checks below read at most 44 bytes, which is within this mark limit
        input.mark(MIN_HEADER_LENGTH);
        try {
            // Signature (Byte 0)
            int signature = input.read();
            if (signature != 3) {
                // Only handle DBase III
                // TODO Use DbfFileTypeEnum in jdbf to make sure type is supported
                return false;
            }
            // YYMMDD (Bytes 1-3)
            if (!validateUpdateDate(input)) {
                return false;
            }
            // Unsigned Int (Bytes 4-7)
            long recordCount = EndianUtils.readIntLE(input) & 0xFFFFFFFFL;
            if (recordCount == 0 || recordCount >= MAX_FILE_SIZE) {
                return false;
            }
            // Bytes 8-9
            int headerLength = EndianUtils.readUShortLE(input);
            if (headerLength < MIN_HEADER_LENGTH) {
                return false;
            } else if (headerLength % 32 != 1) {
                // Must have an extra byte for header terminator
                return false;
            } else if (headerLength / 32 > 255) {
                // Too many fields
                return false;
            }
            // Bytes 10-11
            int recordLength = EndianUtils.readUShortLE(input);
            if (recordLength < 1) {
                // TODO Check if first field length is greater than the record length?
                return false;
            }
            // Make sure overflows are handled and add 1 for the last EOF marker
            // TODO Use Math.multiplyExact in Java 8
            long totalFileSize = LongMath.checkedAdd(
                    LongMath.checkedMultiply(recordCount, recordLength), headerLength + 1);
            if (totalFileSize > MAX_FILE_SIZE) {
                return false;
            }
            long inputSize = tryGetLength(input, metadata);
            if (inputSize != -1) {
                // We know the input size compare against the calculated file size (Be lenient)
                // This could be wrong if the file is packed and has extra junk after the last
                // record (Hopefully they have a matching file extension)
                if (Math.abs(totalFileSize - inputSize) > 4) {
                    return false;
                }
            }
            // Skip reserved (Bytes 12-13)
            IOUtils.skipFully(input, 2);
            // Incomplete transaction flag (Byte 14)
            if (!validateBoolean(input)) {
                return false;
            }
            // Encryption flag (Byte 15)
            if (!validateBoolean(input)) {
                return false;
            }
            // The file size has already been validated no need to go further
            if (inputSize != -1) {
                return true;
            }
            // Skip to first field entry (Bytes 16-31 are not inspected)
            IOUtils.skipFully(input, 16);
            return validateFieldEntry(input);
        } catch (BufferUnderrunException | EOFException | ArithmeticException ignored) {
            // A truncated stream or an arithmetic overflow means this is not a usable header
            return false;
        } finally {
            input.reset();
        }
    }

    /**
     * Validate that the month and day are valid
     *
     * @param input stream positioned at the year byte of the update date
     * @return true if a year byte is present, the month is 1-12 and the day is
     *         within the maximum for that month
     * @throws IOException if reading from the stream fails
     */
    private static boolean validateUpdateDate(InputStream input) throws IOException {
        // TODO : Validate year?
        int year = input.read();
        if (year == -1) {
            return false;
        }
        int month = input.read();
        if (month < 1 || month > 12) {
            return false;
        }
        int day = input.read();
        // An end of stream (-1) is rejected by the lower bound
        return day >= 1 && day <= MAX_DAY_OF_MONTH[month - 1];
    }

    /**
     * Validate that the next byte is 0 or 1
     *
     * @param input stream positioned at the flag byte
     * @return true if next byte is 0 or 1
     * @throws IOException if reading from the stream fails
     */
    private static boolean validateBoolean(InputStream input) throws IOException {
        int flag = input.read();
        return (flag == 0 || flag == 1);
    }

    /**
     * Validate that the field entry has a valid type
     *
     * @param input stream positioned at the start of a field entry
     * @return true if field entry is valid
     * @throws IOException if reading or skipping fails, including
     *         {@link EOFException} when the entry is truncated
     */
    private static boolean validateFieldEntry(InputStream input) throws IOException {
        // Make sure first byte of field name isn't a control character
        // (an end of stream, -1, is rejected by the same comparison)
        if (input.read() < 32) {
            return false;
        }
        // Skip rest of field name
        IOUtils.skipFully(input, 10);
        // Validate field type
        int fieldType = input.read();
        // TODO Use DbfFieldTypeEnum in jdbf to validate field type
        return fieldType == '0' || (fieldType >= 'A' && fieldType <= 'Z');
    }

    /**
     * Try to determine the InputStream length
     *
     * <p>The length reported by a {@link TikaInputStream} is preferred. Otherwise
     * the content length recorded in the metadata is parsed. A missing, blank,
     * non-numeric or negative metadata value is treated as unknown so that it
     * cannot be mistaken for a real size by the caller.
     *
     * @param input the stream being inspected
     * @param metadata metadata that may carry a content length; may be {@code null}
     * @return -1 if unable to determine length
     * @throws IOException if the stream fails while reporting its length
     */
    private static long tryGetLength(InputStream input, Metadata metadata) throws IOException {
        if (input instanceof TikaInputStream) {
            TikaInputStream tis = ((TikaInputStream) input);
            if (tis.hasLength()) {
                return tis.getLength();
            }
        }
        if (metadata == null) {
            return -1;
        }
        String len = metadata.get(Metadata.CONTENT_LENGTH);
        if (len == null) {
            return -1;
        }
        String trimmed = len.trim();
        if (trimmed.isEmpty()) {
            return -1;
        }
        try {
            long parsed = Long.parseLong(trimmed);
            // A negative length is meaningless; report it as unknown
            return parsed >= 0 ? parsed : -1;
        } catch (NumberFormatException ignored) {
            // Not a number: fall back to "unknown" so detection uses the header alone
            return -1;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment