Skip to content

Instantly share code, notes, and snippets.

@benizar
Last active February 14, 2022 18:48
Show Gist options
  • Save benizar/c150a476b103579b60ce6d19da4d2678 to your computer and use it in GitHub Desktop.
Save benizar/c150a476b103579b60ce6d19da4d2678 to your computer and use it in GitHub Desktop.
Extract images and text from a list of PDF files

Run the script, passing the PDF files to process as arguments:

bash pdf-break-down.sh folder/*
#!/bin/bash
#
# pdf-break-down.sh - extract text and images from a list of PDF files.
#
# Usage:
#   bash pdf-break-down.sh folder/*
#
# For every argument ending in ".pdf" a working directory named after the
# sanitized file name is created in the current directory:
#   <name>/pages/     one single-page PDF per page (pdftk burst)
#   <name>/images/    the images of every page as PNG (pdfimages)
#   <name>/<stem>.md  Markdown draft: YAML header, one "## New slide" section
#                     per page, and a trailing "## References" section
# Arguments that do not end in ".pdf" are reported and skipped.
#
# Requires: pdftk, pdftotext, pdfimages and a sed that supports -r, -i and
# \f / \xHH escapes (GNU sed).
#
# TODO: Ensure usability from any path

# Options are set here rather than on the shebang line so they also apply
# when the script is started as "bash pdf-break-down.sh ...".
set -euo pipefail

#######################################
# Turn a file stem into a directory-safe name: lower-case it, map a fixed
# set of accented letters to ASCII, then drop everything outside
# A-Za-z0-9_-.
# Arguments: $1 - file name without extension
# Outputs:   sanitized name on stdout (no trailing newline)
#######################################
sanitize_name() {
  # NOTE(review): tr implementations that work on bytes may not map the
  # multi-byte accented letters below as intended - verify with UTF-8 names.
  printf '%s\n' "$1" \
    | tr "[:upper:]" "[:lower:]" \
    | tr 'áÁàÀãÃâÂéÉêÊíÍóÓõÕôÔúÚñÑçǪº' 'aAaAaAaAeEeEiIoOoOoOuUnNcCao' \
    | tr -cd 'A-Za-z0-9_-'
}

#######################################
# Start the Markdown draft with the default YAML header.
# The file is truncated so that re-running the script does not pile a second
# copy of the document onto the first one.
# Arguments: $1 - path of the Markdown file
#######################################
write_header() {
  cat << 'EOF' > "$1"
---
# Edit if you want to overwrite defaults
language: catalan
draft: false
title: Slides title
keywords: keyword1, keyword2, keyword3
#nocite: |
# @authorYearTopic
---
# Introducció
EOF
}

#######################################
# Append one page to the Markdown draft: a "## New slide" heading, the raw
# page text, and one image link per image found on the page. The images
# themselves are written to the image directory as PNG.
# Arguments: $1 - single-page PDF
#            $2 - image output directory
#            $3 - path of the Markdown file
#######################################
process_page() {
  local page=$1 image_dir=$2 markdown=$3
  local page_stem listing_lines image_count n

  page_stem=${page##*/}
  page_stem=${page_stem%.*}

  # Extract text
  echo "Extracting raw text from PDF."
  {
    printf '\n\n## New slide\n\n'
    pdftotext -enc UTF-8 -layout "$page" -
  } >> "$markdown"

  # Extract all images in PNG format
  echo "Extracting all images in PNG format to '$image_dir' (Page '$page')"
  pdfimages -png "$page" "./$image_dir/$page_stem"

  # "pdfimages -list" prints two header lines before the image rows.
  listing_lines=$(pdfimages -list "$page" | wc -l)
  image_count=$(( listing_lines - 2 ))

  # pdfimages names its output <root>-NNN.png with a three-digit counter,
  # so the links use the same zero padding.
  # NOTE(review): links carry no directory although the files live in the
  # images directory - confirm where the Markdown is rendered from.
  for (( n = 0; n < image_count; n++ )); do
    printf '![](%s-%03d.png)\n' "$page_stem" "$n" >> "$markdown"
  done
}

#######################################
# Clean the extracted text in place, in a single sed pass.
# Runs in the C locale so that every pattern is matched byte-wise.
# Arguments: $1 - path of the Markdown file
#######################################
clean_text() {
  # Expressions, in order:
  #   1. remove page breaks (form feeds emitted by pdftotext)
  #   2. remove UTF-8 encoded Private Use Area characters, U+E000-U+F8FF
  #      NOTE(review): the exact characters of the original pattern are
  #      unknown; this strips the whole BMP private use area - confirm.
  #   3. collapse runs of spaces
  #   4./5. remove leading and trailing whitespace
  #   6.-8. replace en dash (Markdown lists), typographic apostrophe and
  #      ellipsis with their ASCII forms
  LC_ALL=C sed -r -i \
    -e 's/\f//g' \
    -e 's/\xEE[\x80-\xBF]{2}|\xEF[\x80-\xA3][\x80-\xBF]//g' \
    -e 's/ {2,}/ /g' \
    -e 's/^[ \t]*//' \
    -e 's/[ \t]*$//' \
    -e 's/–/-/g' \
    -e "s/’/'/g" \
    -e 's/…/.../g' \
    "$1"
}

#######################################
# Break one PDF down into pages, images and a Markdown draft.
# Arguments: $1 - path of the PDF file
# Returns:   0 on success or when the file is skipped; any failing tool
#            aborts the script because of "set -e".
#######################################
process_pdf() {
  local file=$1
  local filename stem main_dir page_dir image_dir markdown page

  echo "Processing '$file'"

  # Sanitize filename
  filename=$(basename -- "$file")
  stem=${filename%.*}
  main_dir=$(sanitize_name "$stem")
  if [[ -z "$main_dir" ]]; then
    # An empty name would make the page directory resolve to "/pages".
    echo "Skipping '$file': no usable characters left in its name" >&2
    return 0
  fi
  page_dir="$main_dir/pages"
  image_dir="$main_dir/images"
  # The Markdown file keeps the original stem, with "_" turned into "-".
  markdown="./$main_dir/${stem//_/-}.md"

  # Create a working directory (-p: tolerate directories from earlier runs)
  echo "Creating a working directory"
  mkdir -p -- "$main_dir" "$page_dir" "$image_dir"

  echo "PDF Burst"
  pdftk "$file" burst output "$page_dir"/page_%02d.pdf

  write_header "$markdown"

  for page in "$page_dir"/*.pdf; do
    [[ -e "$page" ]] || continue # the glob matched nothing
    process_page "$page" "$image_dir" "$markdown"
  done

  clean_text "$markdown"

  # Insert a references section
  printf '\n## References\n\n' >> "$markdown"
}

#######################################
# Process every argument: PDFs are broken down, anything else is reported.
# Arguments: $@ - files to process
#######################################
main() {
  local file
  for file in "$@"; do
    case "$file" in
      *.pdf)
        process_pdf "$file"
        ;;
      *)
        echo "This is not a PDF: $file"
        ;;
    esac
  done
  echo "Mission accomplished ;)"
}

main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment