Skip to content

Instantly share code, notes, and snippets.

@doraTeX
Created July 5, 2023 08:44
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save doraTeX/bb77721474174628a144f24a65fa474b to your computer and use it in GitHub Desktop.
Save doraTeX/bb77721474174628a144f24a65fa474b to your computer and use it in GitHub Desktop.
A shell script to extract text from PDF on macOS
#!/bin/bash
# Extract the text of one or more PDF files on macOS and print it to stdout.
# Name of this script without its directory; used in usage and error messages.
SCRIPTNAME=$(basename "$0")
#######################################
# Resolve a path to an absolute path by cd-ing into its directory.
# The final component does not have to exist; its directory does.
# Arguments:
#   $@ - the path; several words are joined with spaces (default IFS), so a
#        caller that passes a space-containing path unquoted still works
# Outputs:
#   the absolute path on stdout
# Returns:
#   0 on success, 1 if the containing directory cannot be entered
#######################################
function realpath () {
  local f="$*"
  local base dir
  if [ -d "$f" ]; then
    base=""
    dir="$f"
  else
    base="/$(basename -- "$f")"
    dir=$(dirname -- "$f")
  fi
  # Fail loudly instead of printing a bogus "/name" path rooted at / when the
  # directory is missing or not accessible.
  dir=$(cd -- "$dir" && /bin/pwd) || return 1
  # A file directly under / would otherwise be printed as "//name".
  if [ "$dir" = "/" ] && [ -n "$base" ]; then
    dir=""
  fi
  printf '%s\n' "$dir$base"
}
#######################################
# Extract the text of every page of a PDF through AppleScriptObjC (Quartz).
# Arguments:
#   $1 - path to the PDF file
# Outputs:
#   the page texts joined by linefeeds, on stdout
# Returns:
#   the exit status of osascript
#######################################
function PDF2Text () {
  # The path is spliced into an AppleScript string literal, so backslashes and
  # double quotes must be escaped; otherwise such a path breaks (or changes
  # the meaning of) the generated script.
  local escaped
  escaped=$(printf '%s' "$1" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')
  # osascript diagnostics are deliberately silenced (2>/dev/null): a PDF that
  # cannot be read yields no output rather than an error message.
  osascript \
    -e 'use framework "Quartz"' \
    -e "global CA" \
    -e "set CA to current application" \
    -e "on pdf2text(filePath)" \
    -e "set doc to CA's PDFDocument's alloc's initWithURL:(CA's NSURL's fileURLWithPath:filePath)" \
    -e "set pageCount to doc's pageCount" \
    -e "set resultTexts to CA's NSMutableArray's new()" \
    -e "repeat with i from 1 to pageCount" \
    -e "(resultTexts's addObject:(((doc's pageAtIndex:(i - 1))'s attributedString)'s |string|))" \
    -e "end repeat" \
    -e "return (resultTexts's componentsJoinedByString:linefeed) as text" \
    -e "end pdf2text" \
    -e "set ocrResult to my pdf2text(\"$escaped\")" 2>/dev/null
}
#######################################
# Print the command-line help text.
# Globals:
#   SCRIPTNAME (read)
# Outputs:
#   the usage summary and option list on stdout
#######################################
usage() {
  # One printf call; each argument becomes one output line.
  printf '%s\n' \
    "Usage: $SCRIPTNAME <PATH_TO_PDF_1> <PATH_TO_PDF_2> ..." \
    "" \
    "Options:" \
    " -h, --help Show help" \
    ""
}
# Parse arguments: collect non-option words into params, handle -h/--help,
# reject unknown options, and treat everything after "--" (or "-") as paths.
declare -a args=("$@")
declare -a params=()
I=0
while [ "$I" -lt "${#args[@]}" ]; do
  OPT="${args[$I]}"
  case "$OPT" in
    -h | --help)
      usage
      exit 0
      ;;
    -- | -)
      # Option terminator: every remaining word is taken verbatim as a path.
      params+=("${args[@]:$((I + 1))}")
      break
      ;;
    -*)
      # printf rather than `echo $OPT`: echo swallows words it understands as
      # its own options (-n, -e, -E), which made the message name an empty option.
      echo "$SCRIPTNAME: illegal option -- '$(printf '%s\n' "$OPT" | sed 's/^-*//')'" 1>&2
      exit 1
      ;;
    *)
      # Empty arguments are silently dropped; words starting with "-" never
      # reach this arm because the previous pattern already caught them.
      if [[ -n "$OPT" ]]; then
        params+=("$OPT")
      fi
      ;;
  esac
  I=$((I + 1))
done
# Refuse to run when no PDF path was supplied; both diagnostics go to stderr.
if (( ${#params[@]} == 0 )); then
  {
    echo "$SCRIPTNAME: too few arguments"
    echo "Try '$SCRIPTNAME --help' for more information."
  } >&2
  exit 1
fi
# Extract the text of each requested PDF in turn.
for FILE in "${params[@]}"; do
  # Quote the path: unquoted, it was word-split and glob-expanded, so names
  # with runs of whitespace or characters like * and ? were mangled.
  PDF2Text "$(realpath "$FILE")"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment