gordyt/pdf-ocr-txt

## pdf-ocr-txt
#!/usr/bin/env bash
# Prerequisites (all can be brew installed):
#   pdf2image tesseract

# Print the help text to stdout, then terminate the script with status 1.
usage() {
    printf '%s\n' \
        "usage: $0 file1 ..." \
        '' \
        'Extract text from each (pdf) file and save it to a file with the same' \
        'name as the source file but with a .txt extension' \
        '' \
        'Example:' \
        '- original file: sample.pdf' \
        '- new file: sample.txt'
    exit 1
}

# At least one input file is required; otherwise show the help and quit.
[ "$#" -gt 0 ] || usage

# Compute the output path for an input path: the file name's final extension
# is swapped for .txt, or .txt is appended when the name has no extension.
# Only the file name is examined, so dots in directory names are left alone.
_txtNameFor() {
    local path="$1" dir="" base
    base="${path##*/}"
    if [ "$base" != "$path" ]; then
        dir="${path%/*}/"
    fi
    case "$base" in
        ?*.*) base="${base%.*}" ;;
    esac
    printf '%s\n' "${dir}${base}.txt"
}

# OCR a single PDF and write the recognized text next to it.
# Arguments: $1 - path to the input PDF (may include directories)
# Outputs:   progress message on stdout, diagnostics on stderr; writes the
#            concatenated page text to the .txt sibling of the input
# Exits the script with status 1 when the input file is missing or no
# temporary directory can be created; returns 1 (so the caller may continue
# with other files) when this particular file cannot be processed.
convert() {
    local inFile="$1"
    local outFile tmpDir base p
    local -a pages

    if ! [ -f "$inFile" ]; then
        >&2 echo "error: specified input pdf file '$inFile' not found"
        exit 1
    fi

    outFile=$(_txtNameFor "$inFile")
    # Never let the redirect below truncate the file we are reading from.
    if [ "$outFile" = "$inFile" ]; then
        >&2 echo "error: output file '$outFile' would overwrite the input file"
        return 1
    fi
    echo "OCR'ing text from $inFile and saving to $outFile"

    # An explicit template is accepted by both BSD (macOS) and GNU mktemp.
    tmpDir=$(mktemp -d "${TMPDIR:-/tmp}/pdf.XXXXXX") || {
        >&2 echo "error: unable to create a temporary directory"
        exit 1
    }

    # Copy under the bare file name so inputs given with a directory part work.
    base="${inFile##*/}"
    if ! cp -- "$inFile" "$tmpDir/$base"; then
        rm -rf -- "$tmpDir"
        return 1
    fi

    # Run the tools in a subshell so the caller's working directory (which a
    # relative $outFile depends on) is never changed.
    (
        cd "$tmpDir" || exit 1
        pdf2image -zoom 3 "$base" JUNK
        for p in JUNK*.png; do
            [ -e "$p" ] || continue   # glob matched nothing
            tesseract "$p" "$p" -l eng txt
        done
    )

    pages=("$tmpDir"/JUNK*.txt)
    if ! [ -e "${pages[0]}" ]; then
        >&2 echo "error: no text could be extracted from '$inFile'"
        rm -rf -- "$tmpDir"
        return 1
    fi

    cat -- "${pages[@]}" > "$outFile"
    rm -rf -- "$tmpDir"
}


# Process every file named on the command line, consuming arguments as we go.
until [ "$#" -eq 0 ]; do
    convert "$1"
    shift
done
	#!/usr/bin/env bash
	# Prerequisites (all can be brew installed):
	# pdf2image tesseract

	# Print the help text to stdout, then terminate the script with status 1.
	# printf is used instead of a here-document because an indented EOF
	# terminator is not recognized by a plain `<<` here-document.
	usage() {
	    printf '%s\n' \
	        "usage: $0 file1 ..." \
	        '' \
	        'Extract text from each (pdf) file and save it to a file with the same' \
	        'name as the source file but with a .txt extension' \
	        '' \
	        'Example:' \
	        '- original file: sample.pdf' \
	        '- new file: sample.txt'
	    exit 1
	}

	# At least one input file is required; otherwise show the help and quit.
	[ "$#" -gt 0 ] || usage

	# Compute the output path for an input path: the file name's final extension
	# is swapped for .txt, or .txt is appended when the name has no extension.
	# Only the file name is examined, so dots in directory names are left alone.
	_txtNameFor() {
	    local path="$1" dir="" base
	    base="${path##*/}"
	    if [ "$base" != "$path" ]; then
	        dir="${path%/*}/"
	    fi
	    case "$base" in
	        ?*.*) base="${base%.*}" ;;
	    esac
	    printf '%s\n' "${dir}${base}.txt"
	}

	# OCR a single PDF and write the recognized text next to it.
	# Arguments: $1 - path to the input PDF (may include directories)
	# Outputs:   progress message on stdout, diagnostics on stderr; writes the
	#            concatenated page text to the .txt sibling of the input
	# Exits the script with status 1 when the input file is missing or no
	# temporary directory can be created; returns 1 (so the caller may continue
	# with other files) when this particular file cannot be processed.
	convert() {
	    local inFile="$1"
	    local outFile tmpDir base p
	    local -a pages

	    if ! [ -f "$inFile" ]; then
	        >&2 echo "error: specified input pdf file '$inFile' not found"
	        exit 1
	    fi

	    outFile=$(_txtNameFor "$inFile")
	    # Never let the redirect below truncate the file we are reading from.
	    if [ "$outFile" = "$inFile" ]; then
	        >&2 echo "error: output file '$outFile' would overwrite the input file"
	        return 1
	    fi
	    echo "OCR'ing text from $inFile and saving to $outFile"

	    # An explicit template is accepted by both BSD (macOS) and GNU mktemp.
	    tmpDir=$(mktemp -d "${TMPDIR:-/tmp}/pdf.XXXXXX") || {
	        >&2 echo "error: unable to create a temporary directory"
	        exit 1
	    }

	    # Copy under the bare file name so inputs given with a directory part work.
	    base="${inFile##*/}"
	    if ! cp -- "$inFile" "$tmpDir/$base"; then
	        rm -rf -- "$tmpDir"
	        return 1
	    fi

	    # Run the tools in a subshell so the caller's working directory (which a
	    # relative $outFile depends on) is never changed.
	    (
	        cd "$tmpDir" || exit 1
	        pdf2image -zoom 3 "$base" JUNK
	        for p in JUNK*.png; do
	            [ -e "$p" ] || continue   # glob matched nothing
	            tesseract "$p" "$p" -l eng txt
	        done
	    )

	    pages=("$tmpDir"/JUNK*.txt)
	    if ! [ -e "${pages[0]}" ]; then
	        >&2 echo "error: no text could be extracted from '$inFile'"
	        rm -rf -- "$tmpDir"
	        return 1
	    fi

	    cat -- "${pages[@]}" > "$outFile"
	    rm -rf -- "$tmpDir"
	}


	# Process every file named on the command line, consuming arguments as we go.
	until [ "$#" -eq 0 ]; do
	    convert "$1"
	    shift
	done