@awhileback · Created November 12, 2021
solr-ize your office docs and PDFs to make them searchable
#!/bin/sh
##########################################################################################################
#
# Processor for turning PDF and Office files into JSONs for Solr.
#
# This script requires the following:
#
# - OCRmyPDF + Unpaper + Tesseract + Qpdf for text processing
# - Poppler-Utils for testing PDF text and extracting text
# - Exiftool for working with metadata
# - Curl for talking to Tika and Solr servers
# - Apache Tika app jar present on the system
# - Apache Solr server running
#
# This script also assumes the following, specific to my usage of Tika/Solr:
#
# - Tika and Solr are listening on localhost
# - A "_page_num_" field exists in your Solr schema (1)
# - A "_tags_" multiValued field exists in your Solr schema (2)
# - A "_clean_fname_" field exists in your Solr schema for sanitizzed filenames
# - The "_text_" field in your Solr schema is used for document text
# - sed commands are written for BSD sed (FreeBSD and OSX); they will require alteration for Linux (3)
#
# (1) I store PDF text in Solr per page, rather than the text of a whole document. There are two
# reasons for this. First, as of this writing Solr has no (easy) way to filter search result
# payloads by field; by default every stored field is returned. This is a problem if you have
# large PDF files, because you could be sending your clients large amounts of text unnecessarily.
# If their search results are per-page, the search result payload from Solr is greatly reduced.
# Second, with PDF files, storing the page number has an obvious benefit for client usability.
# If the client knows from the search result which page of the original document its highlighted
# search result is on, the client can create a link to that page, rather than a link to the whole
# document that the user must then search again with ctrl-F to get to where they were going. With
# this functionality in mind, if you read the script you'll note that the metadata for each page
# is identical except for "_page_num_" and "_text_". Do not rely on the PDF metadata's UUID for
# your document unique key: it won't work, since every page (every Solr document) carries an
# identical UUID copied from the original PDF. If you set up Solr in "schemaless" mode per the
# Apache documentation, you will have a separate "id" field from which Solr generates its own
# UUID for each document when the document is uploaded. That works fine with all of the above in
# mind; just make sure your schema does not expect the metadata UUID field to be unique, because
# if it came from this script, it won't be.
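#
# For illustration, a hypothetical two-page PDF uploaded as 990s_2018.pdf would be posted to
# Solr as a JSON array shaped roughly like this (Tika metadata fields abbreviated):
#
# [{"Content-Type":"application/pdf","_clean_fname_":"990s_2018.pdf",
# "_tags_":["990s","2018"],"_page_num_":"1","_text_":"page one text"}
# ,{"Content-Type":"application/pdf","_clean_fname_":"990s_2018.pdf",
# "_tags_":["990s","2018"],"_page_num_":"2","_text_":"page two text"}]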
#
# (2) File names and folder names are separated into search tags and appended as individual words.
# Any character other than a letter, digit, dash, or period is treated as a word delimiter.
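#
# For example, a hypothetical upload named "Annual Report 2018.pdf" is sanitized to
# "Annual_Report_2018.pdf", and its tags come out as:
#
# "_tags_":["Annual","Report","2018"],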
#
# (3) In particular, look at the sed command in clean_tempfiles() that adds the trailing ] to the
# JSON. It will NOT work on Linux as expected. Run these seds unmodified outside of OSX or
# FreeBSD at your own risk!
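#
# If you need a portable version, writing to a temp file instead of using -i should behave the
# same on both BSD and GNU sed (an untested sketch):
#
# sed '$s/,/]/' "$PROCESS_DIR"/"$CUT_FILENAME".json > "$PROCESS_DIR"/json.tmp \
#   && mv "$PROCESS_DIR"/json.tmp "$PROCESS_DIR"/"$CUT_FILENAME".json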
#
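# Before running the script, the custom fields assumed above could be created with Solr's
# Schema API; this is a hypothetical sketch (assumes a core named blacklight-core and
# simple string field types, adjust to taste):
#
# curl -X POST -H 'Content-type:application/json' --data-binary '{
#   "add-field": {"name":"_page_num_", "type":"string", "stored":true},
#   "add-field": {"name":"_tags_", "type":"string", "multiValued":true, "stored":true},
#   "add-field": {"name":"_clean_fname_", "type":"string", "stored":true}
# }' 'http://localhost:8983/solr/blacklight-core/schema'
#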
##########################################################################################################
export PATH="$PATH"
export LANG="en_US.UTF-8"
export MM_CHARSET="en_US.UTF-8"
#####################################################
# You should define the following variables.
# PROCESS_BLOCK is what it suggests: processes that
# will block this script from running if they are
# active when this script is executed. You should
# not work on unknown PDF files while they are being
# written by your printer, already being processed
# by an earlier run of this script, written by
# ghostscript, or uploaded by ftp.
# INCOMING_DIR is the folder you upload to,
# PROCESS_DIR is a temp folder this script writes
# temp files to, and BACKUP_DIR is where the
# resulting processed files are stored when this
# script is done with them. REJECT_DIR is where
# problem files land, THUMB_DIR holds the stock
# thumbnails used for office documents, LOGFILE is
# the run log, TIKA_JAR points at the Tika app jar,
# and USER is the user that will run this script.
#####################################################
PROCESS_BLOCK="qpdf gs ocrmypdf sh"
INCOMING_DIR="$HOME/incoming"
PROCESS_DIR="$HOME/process"
BACKUP_DIR="$HOME/backup"
REJECT_DIR="$HOME/rejected"
THUMB_DIR="$HOME/stock_thumbs"
LOGFILE="$HOME/fileprocess.log"
TIKA_JAR="$HOME/bin/tika-app-1.18.jar"
USER="docserver"
#####################################################
# You should leave the variables below alone unless
# you know what you're doing.
#####################################################
# Non-empty if any blocking process is running.
STATUS=`pgrep -d , -U "$USER" $PROCESS_BLOCK`
# First file (alphabetically) in the incoming folder.
LATEST_FILE=`find "$INCOMING_DIR" -maxdepth 1 -type f | sort | head -n 1`
# Non-empty if a previous run left files in the process folder.
EMPTY_PROCESS=`find "$PROCESS_DIR" -name '*.*'`
# Filename with anything other than letters, digits, dashes, underscores, and periods replaced by _.
CLEAN_FILENAME=`basename "$LATEST_FILE" | sed 's/[^-._A-Za-z0-9]/_/g'`
# Sanitized filename without its extension.
CUT_FILENAME=`echo "$CLEAN_FILENAME" | sed 's/\.[^.]*$//'`
FILE_IS_PDF=`file "$LATEST_FILE" | grep PDF`
FILE_IS_OFFICEDOC=`file "$LATEST_FILE" | grep 'Excel\|Power\|Word\|OpenDocument\|Rich\|Text\|text'`
# Append one page's JSON object: Tika metadata, tags, page number, and the page text.
pdf_process() {
cat "$PROCESS_DIR"/meta.json >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"_tags_\":[\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
# Split the sanitized filename into tags on runs of underscores.
echo "\"$CUT_FILENAME\c" | sed 's/\_\_*/","/g' >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"],\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
sed -i 's/_tags_\":\[\",\"/_tags_":["/g' "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"_page_num_\":\"$i\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\",\"_text_\":\"\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
# Flatten the page text to one line and escape backslashes and double quotes for JSON.
cat "$PROCESS_DIR/$CUT_FILENAME-$CUR_NUM.pdf.txt" | tr '\n' ' ' | tr -d '\f' | sed -e 's/^ *//; s/[\]/\\\\/g; s/["]/\\"/g; s/  */ /g; s/ *$//' >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"}\n," >> "$PROCESS_DIR"/"$CUT_FILENAME".json
}
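# For example, hypothetical page text
#   He said "see C:\docs"
# comes out of the escaping pipeline above as
#   He said \"see C:\\docs\"
# which is safe to embed in the JSON string.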
# Append the whole office document as a single JSON object (page 1).
officedoc_process() {
cat "$PROCESS_DIR"/meta.json >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"_tags_\":[\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
# Turn the folder path into tags. NOTE: the /usr/ and /home/cups/incoming
# patterns are hardcoded; adjust them to match your own INCOMING_DIR.
echo "$LATEST_FILE\c" | sed 's#\/usr\/#/#; s#\/home\/cups\/incoming##g; s#\(.*\)/.*#\1#g; s#/#","#g' >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\",\"$CUT_FILENAME\c" | sed 's/\_\_*/","/g' >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"],\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
sed -i 's/_tags_\":\[\",\"/_tags_":["/g' "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"_page_num_\":\"1\",\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"_text_\":\"\c" >> "$PROCESS_DIR"/"$CUT_FILENAME".json
# Extract plain text with Tika, flatten it, and escape it for JSON.
java -jar "$TIKA_JAR" -T "$PROCESS_DIR"/"$CLEAN_FILENAME" | tr '\n' ' ' | tr -d '\f' | sed -e 's/^ *//; s/[\]/\\\\/g; s/["]/\\"/g; s/  */ /g; s/ *$//' >> "$PROCESS_DIR"/"$CUT_FILENAME".json
echo "\"}\n," >> "$PROCESS_DIR"/"$CUT_FILENAME".json
}
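# For example, a hypothetical upload /home/cups/incoming/Q3_budget.xlsx ends up with
# "_tags_":["Q3","budget"]; any folder components left after the prefix is stripped
# would become tags as well.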
# Close the JSON array, post it to Solr, and either back up or reject the results.
clean_tempfiles() {
# Replace the trailing comma on the last line with the closing ] of the JSON array.
sed -i '$s/,/]/' "$PROCESS_DIR"/"$CUT_FILENAME".json
CURL_STATUS=$(curl -X POST -H 'Content-Type: application/json' 'http://localhost:8983/solr/blacklight-core/update?commit=true' --silent --output /dev/stderr --write-out "%{http_code}" --data-binary @"$PROCESS_DIR"/"$CUT_FILENAME".json)
# Anything outside 2xx (including 000, when the connection itself failed) is a failure.
if [ "$CURL_STATUS" -lt 200 ] || [ "$CURL_STATUS" -ge 300 ]
then
mv "$LATEST_FILE" "$REJECT_DIR"/"$CLEAN_FILENAME"
mv "$PROCESS_DIR"/"$CUT_FILENAME".json "$REJECT_DIR"/"$CLEAN_FILENAME".json
echo "Something went wrong while posting "$LATEST_FILE"\nto the search database. I have saved a copy of the relevant files\nin your rejected folder for you to inspect." >> "$LOGFILE"
rm -f "$PROCESS_DIR"/*.*
echo "\n----------Finished "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
exit 1
else
mv "$PROCESS_DIR"/"$CUT_FILENAME".json "$BACKUP_DIR"/"$CLEAN_FILENAME".json
rm -f "$PROCESS_DIR"/*.*
rm -f "$LATEST_FILE"
fi
}
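# Once documents are indexed, you can sanity-check them with a hypothetical query like
# this one (field names per the schema assumptions above):
#
# curl 'http://localhost:8983/solr/blacklight-core/select?q=_text_:budget&fl=_clean_fname_,_page_num_'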
# Move a problem file to the rejected folder and stop.
error_file() {
mv "$LATEST_FILE" "$REJECT_DIR"/"$CLEAN_FILENAME"
echo "An error occurred while processing the latest incoming\nfile. I have sanitized the filename but processed it no further.\nIt has been moved to the rejected folder.\n" >> "$LOGFILE"
echo "\n----------Finished "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
exit 1
}
# Move a file that is neither a PDF nor an office document to the rejected folder and stop.
not_document() {
mv "$LATEST_FILE" "$REJECT_DIR"/"$CLEAN_FILENAME"
echo "The latest file, "$LATEST_FILE", does not seem to be a\nPDF or Office document. I have sanitized the filename but processed it no further.\nIt has been moved to the rejected folder.\n" >> "$LOGFILE"
echo "\n----------Finished "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
exit 1
}
# Bail out if the latest file changed within the last 5 seconds; it may
# still be mid-upload. Otherwise, log the start of a run.
# Note: stat -f %c is BSD stat; GNU stat on Linux would need e.g. stat -c %Z.
file_age() {
NOW_TIME=`date +%s`
FILE_TIME=`stat -f %c "$LATEST_FILE"`
FILE_AGE=`expr "$NOW_TIME" - "$FILE_TIME"`
if [ "$FILE_AGE" -le 5 ]
then
exit 0
else
echo "----------Started "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
fi
}
# Run qpdf's structural check on the PDF, capturing the exit status once
# ($? is reset by each subsequent test). qpdf exits 0 on success, 2 on
# errors, and 3 on warnings; only hard errors are treated as fatal here.
validate_pdf() {
qpdf --check "$LATEST_FILE" > /dev/null 2>&1
QPDF_STATUS=$?
if [ "$QPDF_STATUS" -eq 2 ]
then
error_file
else
echo "PDF file seems valid, processing...\n" >> "$LOGFILE"
fi
}
if [ ! -z "$STATUS" ]
then
exit 0
elif [ -z "$LATEST_FILE" ]
then
exit 0
elif [ ! -z "$EMPTY_PROCESS" ]
then
exit 0
elif [ "$FILE_AGE" -le 5 ]
then
exit 0
elif [ ! -z "$FILE_IS_PDF" ]
then
file_age
validate_pdf
cp "$LATEST_FILE" "$PROCESS_DIR"/"$CLEAN_FILENAME"
java -jar "$TIKA_JAR" -j "$PROCESS_DIR"/"$CLEAN_FILENAME" | sed -e 's/[()]//g; s/.$/,/' > "$PROCESS_DIR"/meta.json
echo "\"_clean_fname_\":\"$CLEAN_FILENAME\",\c" >> "$PROCESS_DIR"/meta.json
echo "[" > "$PROCESS_DIR"/"$CUT_FILENAME".json
NUM_PAGES=`pdfinfo "$PROCESS_DIR"/"$CLEAN_FILENAME" | grep Pages | sed 's/[^0-9]//g'`
for i in $(seq 1 "$NUM_PAGES")
do
CUR_NUM=$(printf "%04d" "$i")
# Split out page $i as its own single-page PDF.
qpdf --empty --pages "$LATEST_FILE" "$i" -- "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf
TEXT_TEST=$(pdftotext -q -l 1 "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf -)
# pdftotext emits only a form feed for a page with no text layer; such
# pages get OCRed, the rest just have their text extracted.
if [ "$TEXT_TEST" = "$(printf '\f')" ]
then
ocrmypdf -q --force-ocr --deskew --clean --rotate-pages --output-type pdf --sidecar "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf.txt "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf
pdf_process
else
pdftotext "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf "$PROCESS_DIR"/"$CUT_FILENAME"-"$CUR_NUM".pdf.txt
pdf_process
fi
done
rm -f "$PROCESS_DIR"/"$CLEAN_FILENAME"
qpdf --empty --pages "$PROCESS_DIR"/"$CUT_FILENAME"-[0-9][0-9][0-9][0-9].pdf -- "$PROCESS_DIR"/"$CLEAN_FILENAME"
exiftool -all:all= "$PROCESS_DIR"/"$CLEAN_FILENAME"
exiftool -P -tagsFromFile "$LATEST_FILE" "$PROCESS_DIR"/"$CLEAN_FILENAME"
qpdf -linearize --object-streams=generate "$PROCESS_DIR"/"$CLEAN_FILENAME" "$BACKUP_DIR"/"$CLEAN_FILENAME"
clean_tempfiles
echo "\n----------Finished "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
else
error_file
fi
exit 0
elif [ ! -z "$FILE_IS_OFFICEDOC" ]
then
file_age
cp "$LATEST_FILE" "$PROCESS_DIR"/"$CLEAN_FILENAME"
java -jar "$TIKA_JAR" -j "$PROCESS_DIR"/"$CLEAN_FILENAME" | sed -e 's/[()]//g; s/.$/,/' > "$PROCESS_DIR"/meta.json
echo "\"_clean_fname_\":\"$CLEAN_FILENAME\",\c" >> "$PROCESS_DIR"/meta.json
echo "[" > "$PROCESS_DIR"/"$CUT_FILENAME".json
officedoc_process
cp "$LATEST_FILE" "$BACKUP_DIR"/"$CLEAN_FILENAME"
IS_PRES=`file "$LATEST_FILE" | grep 'Presentation\|Power'`
IS_DOC=`file "$LATEST_FILE" | grep 'Word\|Rich\|Text\|text'`
IS_SHEET=`file "$LATEST_FILE" | grep 'Excel\|Spreadsheet'`
if [ ! -z "$IS_PRES" ]
then
cp "$THUMB_DIR"/present.png "$BACKUP_DIR"/"$CLEAN_FILENAME".png
elif [ ! -z "$IS_SHEET" ]
then
cp "$THUMB_DIR"/sheet.png "$BACKUP_DIR"/"$CLEAN_FILENAME".png
elif [ ! -z "$IS_DOC" ]
then
cp "$THUMB_DIR"/document.png "$BACKUP_DIR"/"$CLEAN_FILENAME".png
fi
clean_tempfiles
echo "\n----------Finished "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
exit 0
elif [ ! -z "$FILE_IS_OFFICEDOC" ] || [ ! -z "$FILE_IS_OFFICEDOC" ]
then
echo "----------Started "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
not_document
else
echo "----------Started "$LATEST_FILE" on "$(date)"----------\n" >> "$LOGFILE"
error_file
fi
exit 0