Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bitsnaps/30e66d5f4e127b2d2cb3854aad2277a9 to your computer and use it in GitHub Desktop.
Save bitsnaps/30e66d5f4e127b2d2cb3854aad2277a9 to your computer and use it in GitHub Desktop.
A Groovy script that extracts metadata from files using Apache Tika. Works recursively on a file hierarchy and writes all found metadata into a single xml file.
/**
* Copyright 2013 Kai Sternad
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import groovy.xml.StreamingMarkupBuilder
import org.apache.tika.sax.BodyContentHandler
import org.apache.tika.config.TikaConfig
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.AutoDetectParser
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.Parser
import org.apache.tika.metadata.TikaCoreProperties;
import static org.apache.tika.metadata.TikaCoreProperties.*;
import org.apache.commons.io.FilenameUtils
import groovy.util.logging.Slf4j
import java.security.MessageDigest
/**
* This script extracts some of the Dublin Core metadata fields (and some file metadata) from files and writes them to an xml structure on disk.
* It recursively traverses the file system, starting from the provided directory.<br/>
* Any file having a suffix defined in <code>ALLOWED_FILES</code> will be processed.<br/>
* The metadata fields to be extracted are defined in <code>METADATA_FIELDS</code><br/>
*
*
* This script depends on a working installation of Groovy / Grape and must be invoked from the command line with:
* <pre>groovy TikaService.groovy <root directory></pre>
*
*
* A file named "metadata.xml" is written into the directory the script is executed in.<br/>
* It has the following structure:
* <pre>
* {@code
* <?xml version="1.0" encoding="UTF-8"?>
* <root xmlns:dc="http://purl.org/dc/elements/1.1/"
* xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0"
* xmlns:dcterms="http://purl.org/dc/terms/">
* <parsedFile>
* <fileMetadata name="helloworld.pdf">
* <Content-Type>application/pdf</Content-Type>
* <dc:title>Hello, World</dc:title>
* <dcterms:created>2009-06-25T12:42:58Z</dcterms:created>
* <dcterms:modified>2011-06-25T12:43:16Z</dcterms:modified>
* <file-md5>a8158833d3b1d341e705a4c268a7264c</file-md5>
* ...
* </fileMetadata>
* </parsedFile>
* </root>
* }
* </pre>
* @author Kai Sternad
* @date 2013/10/20
*/
@Grab(group='org.apache.tika', module='tika-parsers', version='1.4')
@Grab(group='commons-io', module='commons-io', version='2.4')
@Grab(group='ch.qos.logback', module='logback-classic', version='1.0.13')
@Slf4j
class TikaService {
/**
 * Command-line entry point.
 * Expects exactly one argument: the directory at which the recursive scan starts.
 * Prints a message and exits with status -1 when the argument is missing or is not a directory;
 * otherwise the metadata of all suitable files is written to the relative path "metadata.xml".
 *
 * @param args command-line arguments
 */
public static void main(def args) {
    // Exactly one argument is accepted: the directory to scan.
    if (args.length != 1) {
        println "please supply the root directory for the metadata extraction"
        System.exit(-1)
    }

    File startDir = new File(args[0])
    if (!startDir.isDirectory()) {
        println "must be a directory"
        System.exit(-1)
    }

    // Collect the candidates first, then hand them to the XML writer.
    List<File> candidates = new RecursiveFileFinder().findSuitableFiles(startDir)
    new TikaService().metaToXml(candidates, new File("metadata.xml"), new MetadataExtractor())
}
/**
 * Extracts the metadata of every given file and writes all of it to one UTF-8 encoded XML document.
 * The document has a single <code>root</code> element declaring the <code>dc</code>, <code>dcterms</code>
 * and <code>meta</code> namespaces. Each file contributes one <code>parsedFile/fileMetadata</code> element
 * whose <code>name</code> attribute is the file name and whose children are named after the metadata keys.
 *
 * @param files   the files to extract metadata from
 * @param outfile the file the XML document is written to
 * @param mde     the extractor that supplies the key/value metadata for each file
 */
public void metaToXml(List files, File outfile, MetadataExtractor mde) {
    log.info("number of allowed files: " + files.size())
    def builder = new StreamingMarkupBuilder()

    // bind() only describes the document; the closure runs (and the files are parsed)
    // when the resulting Writable is written out below.
    def document = builder.bind {
        mkp.xmlDeclaration(version: "1.0", encoding: "utf-8")
        mkp.declareNamespace("dc": "http://purl.org/dc/elements/1.1/")
        mkp.declareNamespace("dcterms": "http://purl.org/dc/terms/")
        mkp.declareNamespace("meta": "urn:oasis:names:tc:opendocument:xmlns:meta:1.0")
        root() {
            files.each { file ->
                log.info("about to parse: ${file.name}")
                Map metaDataFields = mde.getMetadataForFile(file)
                parsedFile() {
                    fileMetadata(name: file.name) {
                        // One child element per metadata entry, named after its key.
                        metaDataFields.each { "$it.key"(it.value) }
                    }
                }
            }
        }
    }

    // withWriter flushes and closes the writer even if extraction throws. Without this the
    // encoder's buffered output could be lost, leaving a truncated or empty file.
    outfile.withWriter('utf-8') { writer ->
        writer << document
    }
}
/**
 * Finds the files below a directory whose extension is listed in <code>ALLOWED_FILES</code>.
 */
static class RecursiveFileFinder {
    /** Lower-case file extensions (without the dot) that qualify a file for processing. */
    private static final List ALLOWED_FILES = [
        "mp4",
        "ai",
        "asf",
        "gif",
        "info",
        "txt",
        "xlsm",
        "tif",
        "dmg",
        "pps",
        "xml",
        "sample",
        "dot",
        "eps",
        "mp3",
        "docx",
        "xls",
        "jpg",
        "zip",
        "ppt",
        "pdf",
        "doc"
    ]

    /**
     * Recursively walks <code>rootDir</code> and collects every non-directory entry whose
     * extension, compared case-insensitively, is contained in <code>ALLOWED_FILES</code>.
     *
     * @param rootDir the directory at which the traversal starts
     * @return the matching files; empty when nothing matches
     */
    public List<File> findSuitableFiles(File rootDir) {
        def files = []
        rootDir.eachFileRecurse { file ->
            // No directories, only files
            if (file.isDirectory()) {
                return
            }
            def suffix = FilenameUtils.getExtension(file.name)
            // Locale.ROOT keeps the comparison locale-independent: with a Turkish default
            // locale "GIF".toLowerCase() contains a dotless i and would never match "gif".
            if (ALLOWED_FILES.contains(suffix.toLowerCase(Locale.ROOT))) {
                files << file
            }
        }
        return files
    }
}
/**
 * Extracts a fixed set of metadata fields, plus an MD5 checksum, from a single file using Apache Tika.
 */
@Slf4j
static class MetadataExtractor {
    /**
     * List of Metadata fields to be extracted.
     * Change these if you would like to extract different fields
     */
    private static final List METADATA_FIELDS = [
        Metadata.CONTENT_TYPE,
        TITLE,
        CREATOR,
        MODIFIER,
        RIGHTS,
        CREATED,
        MODIFIED,
        COVERAGE,
        KEYWORDS,
        DESCRIPTION
    ]

    /** Parser shared by all files handled by this extractor, so the Tika configuration is built only once. */
    private final Parser parser = new AutoDetectParser(new TikaConfig())

    /**
     * Returns the non-empty metadata fields of the given file together with its MD5 checksum.
     *
     * @param file the file to inspect
     * @return a map from field name to value; always contains the key "file-md5"
     */
    public Map getMetadataForFile(File file) {
        Metadata metadata = parseFile(file)
        String md5 = generateMD5(file)
        Map extractedMetadata = extractMetadata(metadata)
        extractedMetadata["file-md5"] = md5
        return extractedMetadata
    }

    /**
     * Copies every field of <code>METADATA_FIELDS</code> that has a non-empty value into a new map.
     *
     * @param tikaMeta the metadata populated by the parser
     * @return a map keyed by the field name (plain Strings) holding the extracted values
     */
    private Map extractMetadata(Metadata tikaMeta) {
        Map nonEmptyFields = [:]
        METADATA_FIELDS.each { field ->
            def extractedMetadataField = tikaMeta.get(field)
            if (extractedMetadataField) {
                // The list mixes plain String names with property objects that expose a name.
                String key = (field instanceof String) ? field : field.name
                // Store under a real String key: a GString key has a different hashCode,
                // so a later lookup by String would miss the entry.
                nonEmptyFields[key] = extractedMetadataField
            }
        }
        return nonEmptyFields
    }

    /**
     * Computes the MD5 checksum of the file's content.
     *
     * @param f the file to hash
     * @return the checksum as 32 lower-case hexadecimal characters
     */
    private String generateMD5(File f) {
        MessageDigest digest = MessageDigest.getInstance("MD5")
        // Feed the digest in chunks so large files are never held in memory as a whole.
        f.eachByte(8192) { byte[] buffer, int bytesRead ->
            digest.update(buffer, 0, bytesRead)
        }
        new BigInteger(1, digest.digest()).toString(16).padLeft(32, '0')
    }

    /**
     * Runs the Tika parser over the file and returns whatever metadata it collected.
     * A parse failure is logged and the metadata gathered up to that point is returned;
     * a failure to open the file is propagated to the caller.
     *
     * @param file the file to parse
     * @return the metadata populated by the parser (possibly empty)
     */
    private Metadata parseFile(File file) {
        Metadata tikaMeta = new Metadata()
        // withInputStream closes the stream once the closure returns or throws.
        file.withInputStream { stream ->
            try {
                // Only metadata is needed, so the body text is discarded by a no-op handler.
                // BodyContentHandler's default constructor buffers text and aborts the parse
                // once its write limit is exceeded.
                parser.parse(stream, new org.xml.sax.helpers.DefaultHandler(), tikaMeta)
                log.debug("parsed file {}", file.absolutePath)
            } catch (Exception e) {
                // Passing the exception itself keeps the stack trace in the log.
                log.error("Failed to parse file ${file.absolutePath}", e)
            }
        }
        return tikaMeta
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment