@awhileback · Created November 12, 2021
solr-ize your office docs and PDFs to make them searchable
#!/bin/sh
##########################################################################################################
#
# Processor for turning PDF and Office files into JSONs for Solr.
#
# This script requires the following:
#
# - OCRmyPDF + Unpaper + Tesseract + Qpdf for text processing
# - Poppler-Utils for testing PDF text and extracting text
# - Exiftool for working with metadata
# - Curl for talking to Tika and Solr servers
# - Apache Tika app jar present on the system
# - Apache Solr server running
#
# This script also assumes the following, specific to my usage of Tika/Solr:
#
# - Tika and Solr are listening on localhost
# - A "_page_num_" field exists in your Solr schema (1)
# - A "_tags_" multiValued field exists in your Solr schema (2)
# - A "_clean_fname_" field exists in your Solr schema for sanitizzed filenames
# - The "_text_" field in your Solr schema is used for document text
# - sed commands are written for BSD sed (FreeBSD and OSX); they will require alteration for Linux (3)
#
# (1) I store PDF text in Solr per page, rather than the text of a whole document. There are two
# reasons for this. First, as of this writing Solr has no (easy) way to filter search result
# payloads by field; by default every stored field is returned. This is a problem if you have
# large PDF files, because you could be sending your clients large amounts of text unnecessarily.
# If their search results are per-page, the search result payload from Solr is greatly reduced.
# Second, with PDF files, storing the page number has an obvious benefit for client usability.
# If the client knows from the search result which page of the original document its highlighted
# search result is on, the client can create a link to that page, rather than a link to the whole
# document that the user must then search again with ctrl-F to get to where they were going. With
# this functionality in mind, if you read the script you'll note that the metadata for each page
# is identical except for "_page_num_" and "_text_". Do not rely on the PDF metadata's UUID for
# your document unique key: it won't work, since every page (every Solr document) carries an
# identical UUID copied from the original PDF. If you set up Solr in "schemaless" mode per the
# Apache documentation, you will have a separate "id" field from which Solr generates its own
# UUID for each document when the document is uploaded. That works fine with all of the above in
# mind; just make sure your schema does not expect the metadata UUID field to be unique, because
# if it came from this script, it won't be.
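#
# For illustration, a hypothetical two-page PDF uploaded as 990s_2018.pdf would be posted to
# Solr as a JSON array shaped roughly like this (Tika metadata fields abbreviated):
#
# [{"Content-Type":"application/pdf","_clean_fname_":"990s_2018.pdf",
# "_tags_":["990s","2018"],"_page_num_":"1","_text_":"page one text"}
# ,{"Content-Type":"application/pdf","_clean_fname_":"990s_2018.pdf",
# "_tags_":["990s","2018"],"_page_num_":"2","_text_":"page two text"}]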
#
# (2) File names and folder names are separated into search tags and appended as individual words.
# Any character other than a letter, digit, dash, or period is treated as a word delimiter.
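#
# For example, a hypothetical upload named "Annual Report 2018.pdf" is sanitized to
# "Annual_Report_2018.pdf", and its tags come out as:
#
# "_tags_":["Annual","Report","2018"],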
#
# (3) In particular, look at the sed command in clean_tempfiles() that adds the trailing ] to the
# JSON. It will NOT work on Linux as expected. Run these seds unmodified outside of OSX or
# FreeBSD at your own risk!
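#
# If you need a portable version, writing to a temp file instead of using -i should behave the
# same on both BSD and GNU sed (an untested sketch):
#
# sed '$s/,/]/' "$PROCESS_DIR"/"$CUT_FILENAME".json > "$PROCESS_DIR"/json.tmp \
#   && mv "$PROCESS_DIR"/json.tmp "$PROCESS_DIR"/"$CUT_FILENAME".json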
#
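# Before running the script, the custom fields assumed above could be created with Solr's
# Schema API; this is a hypothetical sketch (assumes a core named blacklight-core and
# simple string field types, adjust to taste):
#
# curl -X POST -H 'Content-type:application/json' --data-binary '{
#   "add-field": {"name":"_page_num_", "type":"string", "stored":true},
#   "add-field": {"name":"_tags_", "type":"string", "multiValued":true, "stored":true},
#   "add-field": {"name":"_clean_fname_", "type":"string", "stored":true}
# }' 'http://localhost:8983/solr/blacklight-core/schema'
#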
##########################################################################################################
export PATH="$PATH"
export LANG="en_US.UTF-8"
export MM_CHARSET="en_US.UTF-8"
#####################################################
# You should define the following variables.
# PROCESS_BLOCK is what it suggests: processes that
# will block this script from running if they are
# active when this script is executed. You should
# not work on unknown PDF files while they are being
# written by your printer, already being processed
# by an earlier run of this script, written by
# ghostscript, or uploaded by ftp.
# INCOMING_DIR is the folder you upload to,
# PROCESS_DIR is a temp folder this script writes
# temp files to, and BACKUP_DIR is where the
# resulting processed files are stored when this
# script is done with them. REJECT_DIR is where
# problem files land, THUMB_DIR holds the stock
# thumbnails used for office documents, LOGFILE is
# the run log, TIKA_JAR points at the Tika app jar,
# and USER is the user that will run this script.
#####################################################
PROCESS_BLOCK="qpdf gs ocrmypdf sh"
INCOMING_DIR="$HOME/incoming"
PROCESS_DIR="$HOME/process"
BACKUP_DIR="$HOME/backup"
REJECT_DIR="$HOME/rejected"
THUMB_DIR="$HOME/stock_thumbs"
LOGFILE="$HOME/fileprocess.log"
TIKA_JAR="$HOME/bin/tika-app-1.18.jar"
USER="docserver"
#####################################################
# You should leave the variables below alone unless
# you know what you're doing.
#####################################################
# Non-empty if any blocking process is running.
STATUS=`pgrep -d , -U "$USER" $PROCESS_BLOCK`
# First file (alphabetically) in the incoming folder.
LATEST_FILE=`find "$INCOMING_DIR" -maxdepth 1 -type f | sort | head -n 1`
# Non-empty if a previous run left files in the process folder.
EMPTY_PROCESS=`find "$PROCESS_DIR" -name '*.*'`
# Filename with anything other than letters, digits, dashes, underscores, and periods replaced by _.
CLEAN_FILENAME=`basename "$LATEST_FILE" | sed 's/[^-._A-Za-z0-9]/_/g'`
# Sanitized filename without its extension.
CUT_FILENAME=`echo "$CLEAN_FILENAME" | sed 's/\.[^.]*$//'`
FILE_IS_PDF=`file "$LATEST_FILE" | grep PDF`
FILE_IS_OFFICEDOC=`file "$LATEST_FILE" | grep 'Excel\|Power\|Word\|OpenDocument\|Rich\|Text\|text'`
# Append one page's JSON object: Tika metadata, tags, page number, and the page text.
pdf_process() {
cat "$PROCESS_DIR"/meta.json >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"_tags_\":[\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
# Split the sanitized filename into tags on runs of underscores.
echo "\"$CUT_FILENAME\c" | sed 's/\_\_*/","/g' >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"],\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
sed -i 's/_tags_\":\[\",\"/_tags_":["/g' "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"_page_num_\":\"$i\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\",\"_text_\":\"\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
# Flatten the page text to one line and escape backslashes and double quotes for JSON.
cat "$PROCESS_DIR/$CUT_FILENAME-$CUR_NUM.pdf.txt" | tr '\n' ' ' | tr -d '\f' | sed -e 's/^ *//; s/[\]/\\\\/g; s/["]/\\"/g; s/  */ /g; s/ *$//' >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"}\n," >> "$PROCESS_DIR"/"$CUT_FILENAME".json
}
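# For example, hypothetical page text
#   He said "see C:\docs"
# comes out of the escaping pipeline above as
#   He said \"see C:\\docs\"
# which is safe to embed in the JSON string.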
# Append the whole office document as a single JSON object (page 1).
officedoc_process() {
cat "$PROCESS_DIR"/meta.json >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"_tags_\":[\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
# Turn the folder path into tags. NOTE: the /usr/ and /home/cups/incoming
# patterns are hardcoded; adjust them to match your own INCOMING_DIR.
echo "$LATEST_FILE\c" | sed 's#\/usr\/#/#; s#\/home\/cups\/incoming##g; s#\(.*\)/.*#\1#g; s#/#","#g' >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\",\"$CUT_FILENAME\c" | sed 's/\_\_*/","/g' >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"],\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
sed -i 's/_tags_\":\[\",\"/_tags_":["/g' "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"_page_num_\":\"1\",\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"_text_\":\"\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
# Extract plain text with Tika, flatten it, and escape it for JSON.
java -jar "$TIKA_JAR" -T "$PROCESS_DIR"/"$CLEAN_FILENAME" | tr '\n' ' ' | tr -d '\f' | sed -e 's/^ *//; s/[\]/\\\\/g; s/["]/\\"/g; s/  */ /g; s/ *$//' >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"}\n," >> "$PROCESS_DIR"/"$CUT_FILENAME".json
}
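# For example, a hypothetical upload /home/cups/incoming/Q3_budget.xlsx ends up with
# "_tags_":["Q3","budget"]; any folder components left after the prefix is stripped
# would become tags as well.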
# Close the JSON array, post it to Solr, and either back up or reject the results.
clean_tempfiles() {
# Replace the trailing comma on the last line with the closing ] of the JSON array.
sed -i '$s/,/]/' "$PROCESS_DIR"/"$CUT_FILENAME".json
CURL_STATUS=$(curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/blacklight-core/update?commit=true' --silent --output /dev/stderr --write-out "%{http_code}" --data-binary @"$PROCESS_DIR"/"$CUT_FILENAME".json)
# Anything outside 2xx (including 000, when the connection itself failed) is a failure.
if [ "$CURL_STATUS" -lt 200 ] || [ "$CURL_STATUS" -ge 300 ]
then
mv "$LATEST_FILE" "$REJECT_DIR"/"$CLEAN_FILENAME"
mv "$PROCESS_DIR"/"$CUT_FILENAME".json "$REJECT_DIR"/"$CLEAN_FILENAME".json
echo "Something went wrong while posting "$LATEST_FILE"\nto the search database. I have saved a copy of the relevant files\nin your rejected folder for you to inspect." >> "$LOGFILE"
rm -f "$PROCESS_DIR"/*.*
echo "\n----------Finished "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
exit 1
else
mv "$PROCESS_DIR"/"$CUT_FILENAME".json "$BACKUP_DIR"/"$CLEAN_FILENAME".json
rm -f "$PROCESS_DIR"/*.*
rm -f "$LATEST_FILE"
fi
}
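# Once documents are indexed, you can sanity-check them with a hypothetical query like
# this one (field names per the schema assumptions above):
#
# curl 'http://localhost:8983/solr/blacklight-core/select?q=_text_:budget&fl=_clean_fname_,_page_num_'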
# Move a problem file to the rejected folder and stop.
error_file() {
mv "$LATEST_FILE" "$REJECT_DIR"/"$CLEAN_FILENAME"
echo "An error occurred while processing the latest incoming\nfile. I have sanitized the filename but processed it no further.\nIt has been moved to the rejected folder.\n" >> "$LOGFILE"
echo "\n----------Finished "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
exit 1
}
# Move a file that is neither a PDF nor an office document to the rejected folder and stop.
not_document() {
mv "$LATEST_FILE" "$REJECT_DIR"/"$CLEAN_FILENAME"
echo "The latest file, "$LATEST_FILE", does not seem to be a\nPDF or Office document. I have sanitized the filename but processed it no further.\nIt has been moved to the rejected folder.\n" >> "$LOGFILE"
echo "\n----------Finished "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
exit 1
}
# Bail out if the latest file changed within the last 5 seconds; it may
# still be mid-upload. Otherwise, log the start of a run.
# Note: stat -f %c is BSD stat; GNU stat on Linux would need e.g. stat -c %Z.
file_age() {
NOW_TIME=`date +%s`
FILE_TIME=`stat -f %c "$LATEST_FILE"`
FILE_AGE=`expr "$NOW_TIME" - "$FILE_TIME"`
if [ "$FILE_AGE" -le 5 ]
then
exit 0
else
echo "----------Started "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
fi
}
# Run qpdf's structural check on the PDF, capturing the exit status once
# ($? is reset by each subsequent test). qpdf exits 0 on success, 2 on
# errors, and 3 on warnings; only hard errors are treated as fatal here.
validate_pdf() {
qpdf --check "$LATEST_FILE" > /dev/null 2>&1
QPDF_STATUS=$?
if [ "$QPDF_STATUS" -eq 2 ]
then
error_file
else
echo "PDF file seems valid, processing...\n" >> "$LOGFILE"
fi
}
if [ ! -z "$STATUS" ]
then
exit 0
elif [ -z "$LATEST_FILE" ]
then
exit 0
elif [ ! -z "$EMPTY_PROCESS" ]
then
exit 0
elif [ "$FILE_AGE" -le 5 ]
then
exit 0
elif [ ! -z "$FILE_IS_PDF" ]
then
file_age
validate_pdf
cp "$LATEST_FILE" "$PROCESS_DIR"/"$CLEAN_FILENAME"
java -jar "$TIKA_JAR" -j "$PROCESS_DIR"/"$CLEAN_FILENAME" | sed -e 's/[()]//g; s/.$/,/' > "$PROCESS_DIR"/meta.json
echo "\"_clean_fname_\":\"$CLEAN_FILENAME\",\c" >> "$PROCESS_DIR"/meta.json
echo "[" > "$PROCESS_DIR"/"$CUT_FILENAME".json
NUM_PAGES=`pdfinfo "$PROCESS_DIR"/"$CLEAN_FILENAME" | grep Pages | sed 's/[^0-9]//g'`
for i in $(seq 1 "$NUM_PAGES")
do
CUR_NUM=$(printf "%04d" "$i")
# Split out page $i as its own single-page PDF.
qpdf --empty --pages "$LATEST_FILE" "$i" -- "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf
TEXT_TEST=$(pdftotext -q -l 1 "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf -)
# pdftotext emits only a form feed for a page with no text layer; such
# pages get OCRed, the rest just have their text extracted.
if [ "$TEXT_TEST" = "$(printf '\f')" ]
then
ocrmypdf -q --force-ocr --deskew --clean --rotate-pages --output-type pdf --sidecar "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf.txt "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf
pdf_process
else
pdftotext "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf.txt
pdf_process
fi
done
rm -f "$PROCESS_DIR"/"$CLEAN_FILENAME"
qpdf --empty --pages "$PROCESS_DIR"/"$CUT_FILENAME"-[0-9][0-9][0-9][0-9].pdf -- "$PROCESS_DIR"/"$CLEAN_FILENAME"
exiftool -all:all= "$PROCESS_DIR"/"$CLEAN_FILENAME"
exiftool -P -tagsFromFile "$LATEST_FILE" "$PROCESS_DIR"/"$CLEAN_FILENAME"
qpdf -linearize --object-streams=generate "$PROCESS_DIR"/"$CLEAN_FILENAME" "$BACKUP_DIR"/"$CLEAN_FILENAME"
clean_tempfiles
echo "\n----------Finished "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
else
error_file
fi
exit 0
elif [ ! -z "$FILE_IS_OFFICEDOC" ]
then
file_age
cp "$LATEST_FILE" "$PROCESS_DIR"/"$CLEAN_FILENAME"
java -jar "$TIKA_JAR" -j "$PROCESS_DIR"/"$CLEAN_FILENAME" | sed -e 's/[()]//g; s/.$/,/' > "$PROCESS_DIR"/meta.json
echo "\"_clean_fname_\":\"$CLEAN_FILENAME\",\c" >> "$PROCESS_DIR"/meta.json
echo "[" > "$PROCESS_DIR"/"$CUT_FILENAME".json
officedoc_process
cp "$LATEST_FILE" "$BACKUP_DIR"/"$CLEAN_FILENAME"
IS_PRES=`file "$LATEST_FILE" | grep 'Presentation\|Power'`
IS_DOC=`file "$LATEST_FILE" | grep 'Word\|Rich\|Text\|text'`
IS_SHEET=`file "$LATEST_FILE" | grep 'Excel\|Spreadsheet'`
if [ ! -z "$IS_PRES" ]
then
cp "$THUMB_DIR"/present.png "$BACKUP_DIR"/"$CLEAN_FILENAME".png
elif [ ! -z "$IS_SHEET" ]
then
cp "$THUMB_DIR"/sheet.png "$BACKUP_DIR"/"$CLEAN_FILENAME".png
elif [ ! -z "$IS_DOC" ]
then
cp "$THUMB_DIR"/document.png "$BACKUP_DIR"/"$CLEAN_FILENAME".png
fi
clean_tempfiles
echo "\n----------Finished "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
exit 0
elif [ ! -z "$FILE_IS_OFFICEDOC" ] || [ ! -z "$FILE_IS_OFFICEDOC" ]
then
echo "----------Started "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
not_document
else
echo "----------Started "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
error_file
fi
exit 0