-
-
Save ernstki/28fbe8c4cfa76dcb08cbc2282a3c8949 to your computer and use it in GitHub Desktop.
Wrapper script for Apache Tika that can optionally enable Tesseract OCR for a single invocation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
##
##  Simple wrapper script for Apache Tika
##
##  Usage: download the latest `tika-app-*.jar` from [1] to your ~/bin
##         and copy this script alongside. Make executable with
##
##             chmod a+x ~/bin/tika
##
##         Test with `tika --help`.
##
##  Options handled by this wrapper (everything else is passed to Tika):
##
##    -o, --ocr   Only recognized as the FIRST argument. Runs Tika without
##                the no-OCR config, so Tesseract is used if Tika finds it.
##                Without this flag, `--config=<script dir>/tika_no_ocr.xml`
##                is prepended to the arguments (the file is created on
##                first use) to exclude TesseractOCRParser.
##
##  Environment:
##
##    TRACE=1     Enable shell execution tracing (`set -x`).
##
##  Author:   Kevin Ernst (ernstki -at- mail.uc.edu)
##  Date:     15 Dec 2023
##  License:  WTFPL
##
##  [1]: https://tika.apache.org/download.html

# set TRACE=1 in the environment to enable execution tracing; evaluated
# before `set -u` so that an unset TRACE is not an error
(( ${TRACE:-0} )) && set -x
set -u

# Print an error message to stderr and abort the script.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Absolute path of the directory holding this script (and the Tika jar).
# CDPATH is blanked for the `cd` so it cannot print an extra path that
# would end up inside MYDIR.
MYDIR=$(CDPATH='' cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) \
  || die "Unable to determine the directory containing this script."

# Yes, yes, I know XDG, ~/.config, ~/Library/Application Settings and all that.
CONFIG="$MYDIR"/tika_no_ocr.xml

# "${1-}" rather than "$1": with `set -u` in effect, a bare $1 aborts the
# script when it is invoked without any arguments.
case "${1-}" in
  -o | --ocr)
    # Tika's own default mode of operation: use Tesseract if installed
    if ! command -v tesseract >/dev/null 2>&1; then
      echo "WARNING: Tesseract not found in search path. Tika won't try to OCR file contents." >&2
    fi
    shift  # drop the wrapper-only flag; Tika must not see it
    ;;
  *)
    # (Re)create the config only when it is missing or empty
    if [[ ! -s "$CONFIG" ]]; then
      # source: https://cwiki.apache.org/confluence/display/TIKA/TikaOCR#TikaOCR-disable-ocr
      echo "Writing new parser $CONFIG to exclude TesseractOCRParser." >&2
      # Quoted delimiter: the XML is written literally, with no expansion
      cat >"$CONFIG" <<'END_OF_XML' || die "Unable to write $CONFIG."
<?xml version="1.0" encoding="UTF-8"?>
<properties>
  <parsers>
    <parser class="org.apache.tika.parser.DefaultParser">
      <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
    </parser>
  </parsers>
</properties>
END_OF_XML
    fi
    echo "Tesseract OCR is disabled for this run; try '-o' / '--ocr' to enable it." >&2
    set -- --config="$CONFIG" "$@"
    ;;
esac

# Resolve the jar glob ourselves instead of handing it to `java -jar`:
# with several matches the extra jars would be passed to Tika as input
# files, and with no match the literal pattern would be passed to java.
jar=
n_jars=0
for candidate in "$MYDIR"/tika-app-*.jar; do
  [[ -f "$candidate" ]] || continue  # an unmatched glob stays literal
  n_jars=$(( n_jars + 1 ))
  # Keep whichever candidate has the newest modification time
  if [[ -z "$jar" || "$candidate" -nt "$jar" ]]; then
    jar=$candidate
  fi
done

[[ -n "$jar" ]] \
  || die "No tika-app-*.jar found in $MYDIR; download one from https://tika.apache.org/download.html"

if (( n_jars > 1 )); then
  echo "WARNING: Found $n_jars tika-app-*.jar files in $MYDIR; using the most recently modified one, ${jar##*/}." >&2
fi

# exec so that java's exit status becomes the wrapper's exit status
exec java -jar "$jar" "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment