eheikes/textify.sh

## textify.sh
#!/bin/bash
#
# Converts images & PDF files to text.
# See http://ericheikes.com/converting-books-to-mp3-audio-text-to-speech
#

# Converts a single image or PDF file to cleaned-up text.
#
# Arguments: $1 - path of the input file
# Outputs:   writes "<name>.txt" in the current directory, where <name> is
#            the input's base name without its extension; diagnostics go
#            to stderr
# Returns:   0 on success, non-zero if any step failed
textify_file() {
  local file=$1
  local extension stem newfile tempfile
  local status=0
  local -a fixes

  extension="${file##*.}"
  # The suffix is quoted so an extension containing spaces or glob characters
  # reaches basename as one argument; "--" protects names starting with "-".
  stem=$(basename -- "$file" ".$extension")
  newfile="$stem.txt"

  # Convert to text. The PDF match is case-insensitive so "X.PDF" is not
  # handed to the OCR engine.
  case "$extension" in
    [Pp][Dd][Ff])
      pdftotext -enc UTF-8 -nopgbrk -layout "$file" "$newfile" || status=$?
      ;;
    *)
      # tesseract is given the output base name only; the steps below expect
      # its result at "$newfile".
      tesseract "$file" "$stem" -psm 1 || status=$?
      ;;
  esac

  # Stop here when nothing was produced; otherwise the steps below would
  # create (or clobber) "$newfile" with an empty file.
  if (( status != 0 )) || [[ ! -f "$newfile" ]]; then
    printf 'textify: could not convert %s; skipping\n' "$file" >&2
    return 1
  fi

  # Replace non-ASCII characters with 7-bit ASCII equivalents.
  tempfile=$(mktemp) || return 1
  if ! uni2ascii -B "$newfile" > "$tempfile"; then
    # Keep the converted text as it is rather than replacing it with
    # partial output.
    rm -f -- "$tempfile"
    printf 'textify: uni2ascii failed for %s\n' "$newfile" >&2
    return 1
  fi
  mv -- "$tempfile" "$newfile"

  # All substitutions work line by line, so one sed pass gives the same
  # result as running them one after another.
  fixes=(
    # OCR often interprets "w" as "vv",
    # and "vv" is an uncommon combination in English.
    -e 's/vv/w/ig'
    # OCR sometimes interprets "&" as other characters.
    -e 's/:5:/\&/ig'
    -e 's/8c/\&/ig'
    # Some observed PDFs use "greek question mark" instead of semicolon.
    # NOTE(review): this matches the literal text "0x037E", presumably the
    # escape uni2ascii emits for U+037E -- confirm.
    -e 's/0x037E/;/ig'
  )
  sed -r -i "${fixes[@]}" "$newfile"
}

for file in "$@"; do
  textify_file "$file"
done
	#!/bin/bash
	#
	# Converts images & PDF files to text.
	# See http://ericheikes.com/converting-books-to-mp3-audio-text-to-speech
	#

	# Converts a single image or PDF file to cleaned-up text.
	#
	# Arguments: $1 - path of the input file
	# Outputs:   writes "<name>.txt" in the current directory, where <name> is
	#            the input's base name without its extension; diagnostics go
	#            to stderr
	# Returns:   0 on success, non-zero if any step failed
	textify_file() {
	  local file=$1
	  local extension stem newfile tempfile
	  local status=0
	  local -a fixes

	  extension="${file##*.}"
	  # The suffix is quoted so an extension containing spaces or glob
	  # characters reaches basename as one argument; "--" protects names
	  # starting with "-".
	  stem=$(basename -- "$file" ".$extension")
	  newfile="$stem.txt"

	  # Convert to text. The PDF match is case-insensitive so "X.PDF" is not
	  # handed to the OCR engine.
	  case "$extension" in
	    [Pp][Dd][Ff])
	      pdftotext -enc UTF-8 -nopgbrk -layout "$file" "$newfile" || status=$?
	      ;;
	    *)
	      # tesseract is given the output base name only; the steps below
	      # expect its result at "$newfile".
	      tesseract "$file" "$stem" -psm 1 || status=$?
	      ;;
	  esac

	  # Stop here when nothing was produced; otherwise the steps below would
	  # create (or clobber) "$newfile" with an empty file.
	  if (( status != 0 )) || [[ ! -f "$newfile" ]]; then
	    printf 'textify: could not convert %s; skipping\n' "$file" >&2
	    return 1
	  fi

	  # Replace non-ASCII characters with 7-bit ASCII equivalents.
	  tempfile=$(mktemp) || return 1
	  if ! uni2ascii -B "$newfile" > "$tempfile"; then
	    # Keep the converted text as it is rather than replacing it with
	    # partial output.
	    rm -f -- "$tempfile"
	    printf 'textify: uni2ascii failed for %s\n' "$newfile" >&2
	    return 1
	  fi
	  mv -- "$tempfile" "$newfile"

	  # All substitutions work line by line, so one sed pass gives the same
	  # result as running them one after another.
	  fixes=(
	    # OCR often interprets "w" as "vv",
	    # and "vv" is an uncommon combination in English.
	    -e 's/vv/w/ig'
	    # OCR sometimes interprets "&" as other characters.
	    -e 's/:5:/\&/ig'
	    -e 's/8c/\&/ig'
	    # Some observed PDFs use "greek question mark" instead of semicolon.
	    # NOTE(review): this matches the literal text "0x037E", presumably the
	    # escape uni2ascii emits for U+037E -- confirm.
	    -e 's/0x037E/;/ig'
	  )
	  sed -r -i "${fixes[@]}" "$newfile"
	}

	for file in "$@"; do
	  textify_file "$file"
	done