Last active
April 13, 2016 18:07
-
-
Save fxfixer/e54f86095a548cbfb8aeb948ff77a41b to your computer and use it in GitHub Desktop.
Tika DBF file detector
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.dbf; | |
import java.io.EOFException; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import org.apache.commons.io.IOUtils; | |
import org.apache.tika.detect.Detector; | |
import org.apache.tika.io.EndianUtils; | |
import org.apache.tika.io.EndianUtils.BufferUnderrunException; | |
import org.apache.tika.io.TikaInputStream; | |
import org.apache.tika.metadata.Metadata; | |
import org.apache.tika.mime.MediaType; | |
import com.google.common.math.LongMath; | |
/** | |
* Detect DBF files by checking for a DBF Header | |
* | |
* @author Nick C | |
*/ | |
public class DBFDetector implements Detector { | |
private static final long serialVersionUID = 1L; | |
private static final MediaType DBF_TYPE = MediaType.application("x-dbf"); | |
private static final long MAX_FILE_SIZE = 0xFFFFFFFFL; | |
private static final byte FIELD_HEADER_SIZE = 32; | |
// 32 for header + 32 for first field header + 1 for terminator | |
private static final int MIN_HEADER_LENGTH = FIELD_HEADER_SIZE * 2 + 1; | |
private static final byte[] MAX_DAY_OF_MONTH = | |
new byte[] {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; | |
@Override | |
public MediaType detect(InputStream input, Metadata metadata) throws IOException { | |
if (input != null && hasValidDBFHeader(input, metadata)) { | |
return DBF_TYPE; | |
} else { | |
return MediaType.OCTET_STREAM; | |
} | |
} | |
public static boolean hasValidDBFHeader(InputStream input, Metadata metadata) | |
throws IOException { | |
input.mark(MIN_HEADER_LENGTH); | |
try { | |
int signature = input.read(); | |
if (signature != 3) { | |
// Only handle DBase III | |
// TODO Use DbfFileTypeEnum in jdbf to make sure type is supported | |
return false; | |
} | |
// YYMMDD (Bytes 1-3) | |
if (!validateUpdateDate(input)) { | |
return false; | |
} | |
// Unsigned Int (Bytes 4-7) | |
long recordCount = EndianUtils.readIntLE(input) & 0xFFFFFFFFL; | |
if (recordCount == 0 || recordCount >= MAX_FILE_SIZE) { | |
return false; | |
} | |
int headerLength = EndianUtils.readUShortLE(input); | |
if (headerLength < MIN_HEADER_LENGTH) { | |
return false; | |
} else if (headerLength % 32 != 1) { | |
// Must have an extra byte for header terminator | |
return false; | |
} else if (headerLength / 32 > 255) { | |
// Too many fields | |
return false; | |
} | |
// Bytes 10-11 | |
int recordLength = EndianUtils.readUShortLE(input); | |
if (recordLength < 1) { | |
// TODO Check if first field length is greater than the record length? | |
return false; | |
} | |
// Make sure overflows are handled and add 1 for the last EOF marker | |
// TODO Use Math.multiplyExact in Java 8 | |
long totalFileSize = LongMath.checkedAdd( | |
LongMath.checkedMultiply(recordCount, recordLength), headerLength + 1); | |
if (totalFileSize > MAX_FILE_SIZE) { | |
return false; | |
} | |
long inputSize = tryGetLength(input, metadata); | |
if (inputSize != -1) { | |
// We know the input size compare against the calculated file size (Be lenient) | |
// This could be wrong if the file is packed and has extra junk after the last | |
// record (Hopefully they have a matching file extension) | |
if (Math.abs(totalFileSize - inputSize) > 4) { | |
return false; | |
} | |
} | |
// Skip reserved | |
IOUtils.skipFully(input, 2); | |
// Incomplete transaction flag (Byte 14) | |
if (!validateBoolean(input)) { | |
return false; | |
} | |
// Encryption flag (Byte 15) | |
if (!validateBoolean(input)) { | |
return false; | |
} | |
// The file size has already been validated no need to go further | |
if (inputSize != -1) { | |
return true; | |
} | |
// Skip to first field entry | |
IOUtils.skipFully(input, 16); | |
if (!validateFieldEntry(input)) { | |
return false; | |
} | |
return true; | |
} catch (BufferUnderrunException | EOFException | ArithmeticException e) { | |
// Ignore | |
return false; | |
} finally { | |
input.reset(); | |
} | |
} | |
/** | |
* Validate that the month and day are valid | |
* | |
* @param input | |
* @return | |
* @throws IOException | |
*/ | |
private static boolean validateUpdateDate(InputStream input) throws IOException { | |
// TODO : Validate year? | |
int year = input.read(); | |
if (year == -1) { | |
return false; | |
} | |
int month = input.read(); | |
if (month < 1 || month > 12) { | |
return false; | |
} | |
int day = input.read(); | |
if (day < 1 || day > MAX_DAY_OF_MONTH[month - 1]) { | |
return false; | |
} | |
return true; | |
} | |
/** | |
* Validate that the next byte is 0 or 1 | |
* | |
* @param input | |
* @return true if next byte is 0 or 1 | |
* @throws IOException | |
*/ | |
private static boolean validateBoolean(InputStream input) throws IOException { | |
int flag = input.read(); | |
return (flag == 0 || flag == 1); | |
} | |
/** | |
* Validate that the field entry has a valid type | |
* | |
* @param input | |
* @return true if field entry is valid | |
* @throws IOException | |
*/ | |
private static boolean validateFieldEntry(InputStream input) throws IOException { | |
// Make sure first byte of field name isn't a control character | |
if (input.read() < 32) { | |
return false; | |
} | |
// Skip rest of field name | |
IOUtils.skipFully(input, 10); | |
// Validate field type | |
int fieldType = input.read(); | |
if (fieldType == '0' || (fieldType >= 'A' && fieldType <= 'Z')) { | |
// TODO Use DbfFieldTypeEnum in jdbf to validate field type | |
return true; | |
} else { | |
return false; | |
} | |
} | |
/** | |
* Try to determine the InputStream length | |
* | |
* @param input | |
* @param metadata | |
* @return -1 if unable to determine length | |
* @throws IOException | |
*/ | |
private static long tryGetLength(InputStream input, Metadata metadata) throws IOException { | |
if (input instanceof TikaInputStream) { | |
TikaInputStream tis = ((TikaInputStream) input); | |
if (tis.hasLength()) { | |
return tis.getLength(); | |
} | |
} | |
String len = metadata.get(Metadata.CONTENT_LENGTH); | |
if (len != null && !len.isEmpty()) { | |
try { | |
return Long.parseLong(len); | |
} catch (NumberFormatException e) { | |
// swallow | |
} | |
} | |
return -1; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment