benizar/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Run the script with the PDF files to process as arguments; any argument that does not end in .pdf is reported and skipped:

bash pdf-break-down.sh folder/*

  
## _pdf-break-down.sh
#!/bin/bash
#
# pdf-break-down.sh - break PDF slide decks down into an editable Markdown draft.
#
# Usage: bash pdf-break-down.sh FILE...
#
# For every PDF argument a working directory is created in the current
# directory, named after the sanitized file name, containing:
#   pages/     one single-page PDF per slide (pdftk burst)
#   images/    every embedded image as PNG (pdfimages)
#   <stem>.md  YAML header, the raw text of each slide and its image links
# Arguments that are not PDFs are reported and skipped.
#
# Needs pdftk, pdftotext, pdfimages and a sed that supports -E and in-place -i.

# Strict mode is set here rather than in the shebang, because shebang options
# are ignored when the script is started as "bash pdf-break-down.sh".
set -euo pipefail

#TODO: Ensure usability from any path

#######################################
# Turn a file stem into a lowercase ASCII directory name.
# Accented letters are replaced with bash pattern substitution instead of tr,
# because GNU tr works on bytes and mangles multibyte characters.
# Arguments: $1 - file name without extension
# Outputs:   sanitized name on stdout (may be empty)
#######################################
sanitize_name() {
  local name=$1
  local -a accented=(á Á à À ã Ã â Â é É ê Ê í Í ó Ó õ Õ ô Ô ú Ú ñ Ñ ç Ç ª º)
  local -a plain=(a a a a a a a a e e e e i i o o o o o o u u n n c c a o)
  local i

  for i in "${!accented[@]}"; do
    name=${name//"${accented[i]}"/${plain[i]}}
  done

  # Only single-byte characters are kept, so byte-wise tr is safe here.
  printf '%s' "$name" \
    | LC_ALL=C tr 'A-Z' 'a-z' \
    | LC_ALL=C tr -cd 'a-z0-9_-'
}

#######################################
# Append the YAML front matter and the first heading to a Markdown file.
# Arguments: $1 - Markdown file to append to
#######################################
write_header() {
  cat << 'EOF' >> "$1"
---
# Edit if you want to overwrite defaults
language: catalan
draft: false

title: Slides title
keywords: keyword1, keyword2, keyword3

#nocite: |
#  @authorYearTopic
---

# Introducció

EOF
}

#######################################
# Tidy the extracted text in place.
# Runs byte-wise (LC_ALL=C) so the result does not depend on the caller's
# locale; every pattern below is written as an explicit byte sequence.
# Arguments: $1 - Markdown file to clean
#######################################
clean_text() {
  # 1: form feeds (page breaks)
  # 2-3: UTF-8 encodings of the private use area U+E000-U+F8FF
  # 4: runs of spaces; 5-6: leading and trailing blanks
  # 7-9: en dash, typographic apostrophe and ellipsis to plain ASCII
  LC_ALL=C sed -E -i \
    -e $'s/\f//g' \
    -e $'s/\xee[\x80-\xbf][\x80-\xbf]//g' \
    -e $'s/\xef[\x80-\xa3][\x80-\xbf]//g' \
    -e 's/ {2,}/ /g' \
    -e $'s/^[ \t]*//' \
    -e $'s/[ \t]*$//' \
    -e 's/–/-/g' \
    -e "s/’/'/g" \
    -e 's/…/.../g' \
    -- "$1"
}

#######################################
# Burst one PDF and build its Markdown draft.
# Arguments: $1 - path to the PDF
# Returns:   0 on success or when the file is skipped; under "set -e" a
#            failing tool aborts the script.
#######################################
process_pdf() {
  local file=$1
  local filename stem main_dir page_dir image_dir rawtext_file
  local page page_stem img

  echo "Processing '$file'"

  # Sanitize filename
  filename=$(basename -- "$file")
  stem=${filename%.*}
  main_dir=$(sanitize_name "$stem")
  if [[ -z "$main_dir" ]]; then
    # An empty name would make the script work in "/pages" and "/images".
    echo "Skipping '$file': no usable characters left in its name" >&2
    return 0
  fi

  page_dir=$main_dir/pages
  image_dir=$main_dir/images
  # The Markdown file keeps the original stem, only with '_' turned into '-'.
  rawtext_file=./$main_dir/${stem//_/-}.md

  # Create a working directory (-p: running again on the same PDF must not fail;
  # note that the Markdown file is appended to, not replaced)
  echo "Creating a working directory"
  mkdir -p -- "$page_dir" "$image_dir"

  echo "PDF Burst"
  pdftk "$file" burst output "$page_dir"/page_%02d.pdf

  # Append yaml header
  write_header "$rawtext_file"

  for page in "$page_dir"/*.pdf; do
    [[ -e "$page" ]] || continue # the glob stays literal when nothing matches
    page_stem=$(basename -- "${page%.*}")

    # Extract text
    echo "Extracting raw text from PDF."
    printf '\n\n## New slide\n\n' >> "$rawtext_file"
    pdftotext -enc UTF-8 -layout "$page" - >> "$rawtext_file"

    # Extract all images in PNG format
    echo "Extracting all images in PNG format to '$image_dir' (Page '$page')"
    pdfimages -png "$page" "./$image_dir/$page_stem"

    # Link the files that were really written instead of guessing their names
    # from a count; paths are relative to the Markdown file.
    for img in "$image_dir/$page_stem"-*.png; do
      [[ -e "$img" ]] || continue
      printf '![](images/%s)\n' "${img##*/}" >> "$rawtext_file"
    done
  done

  # Clean text
  clean_text "$rawtext_file"

  # Insert a references section
  printf '\n## References\n\n' >> "$rawtext_file"
}

#######################################
# Process every argument, skipping anything that is not a PDF.
# Arguments: files to process
#######################################
main() {
  local file

  for file in "$@"; do
    case "$file" in
      *.[pP][dD][fF])
        # it's a pdf
        process_pdf "$file"
        ;;
      *)
        # it's not a pdf
        echo "This is not a PDF: $file"
        ;;
    esac
  done

  echo "Mission accomplished ;)"
}

main "$@"
	#!/bin/bash -e

	#TODO: Ensure usability from any path

	for FILE in "$@"
	do
	case "$FILE" in
	*.pdf )
	# it's a pdf
	echo "Processing '$FILE'"

	# Sanitize filename
	FILENAME=$(basename "$FILE")
	MAIN_DIR=$(echo "${FILENAME%.*}" \
	\| tr "[:upper:]" "[:lower:]" \
	\| tr 'áÁàÀãÃâÂéÉêÊíÍóÓõÕôÔúÚñÑçÇªº' 'aAaAaAaAeEeEiIoOoOoOuUnNcCao' \
	\| tr -cd 'A-Za-z0-9_-')
	RAWTEXT=$(echo "$MAIN_DIR" \| tr '_' '-')

	PAGE_DIR="$MAIN_DIR"/pages
	IMAGE_DIR="$MAIN_DIR"/images

	RAWTEXT_FILE=./$MAIN_DIR/$(echo "${FILENAME%.*}" \| tr '_' '-').md

	# Create a working directory
	echo "Creating a working directory"
	mkdir "$MAIN_DIR"
	mkdir "$PAGE_DIR"
	mkdir "$IMAGE_DIR"

	echo "PDF Burst"
	pdftk $FILE burst output "$PAGE_DIR"/page_%02d.pdf


	# Append yaml header
	cat << EOF >> $RAWTEXT_FILE
	---
	# Edit if you want to overwrite defaults
	language: catalan
	draft: false

	title: Slides title
	keywords: keyword1, keyword2, keyword3

	#nocite: \|
	# @authorYearTopic
	---

	# Introducció

	EOF



	for PAGE in "$PAGE_DIR"/*.pdf
	do

	# Extract text
	echo "Extracting raw text from PDF."
	echo '' >> $RAWTEXT_FILE
	echo '' >> $RAWTEXT_FILE
	echo '## New slide' >> $RAWTEXT_FILE
	echo '' >> $RAWTEXT_FILE
	pdftotext -enc UTF-8 -layout $PAGE - >> $RAWTEXT_FILE

	# Extract all images in PNG format
	echo "Extracting all images in PNG format to '$IMAGE_DIR' (Page '$PAGE')"
	pdfimages -png "$PAGE" ./$IMAGE_DIR/$(basename ${PAGE%.*})


	a=($(pdfimages -list "$PAGE" \| wc -l))
	lines=${a[0]}
	slidepics="${lines}"-2
	#words=${a[1]}
	#chars=${a[2]}

	for ((n=0;n<${slidepics};n++))
	do
	echo -e "![]("$(basename ${PAGE%.*})-"${n}"".png)" >> $RAWTEXT_FILE
	done


	#mogrify -format png ./$FOLDER/*.ppm

	# Remove intermediate images (ppm,pgm or pbm)
	#find . -type f -not \(-name 'gz' -or -name 'odt' -or -name '*.jpg' \) -delete
	#find ./"$FOLDER" -type f -not -name '*.png' -delete

	done

	# Clean text (interactive, regex)
	sed -r 's///g' -i $RAWTEXT_FILE # Remove page breaks
	sed -r 's///g' -i $RAWTEXT_FILE # Private use area
	sed -r 's///g' -i $RAWTEXT_FILE # Private use area

	# Remove consecutive characters
	sed -r 's/ {2,}/ /g' -i $RAWTEXT_FILE
	# Remove leading and trailing whitespace
	sed 's/^[ \t]//;s/[ \t]$//' -i $RAWTEXT_FILE

	# Markdown lists
	sed -r 's/–/-/g' -i $RAWTEXT_FILE # Remove page breaks
	sed -r "s/’/'/g" -i $RAWTEXT_FILE # Remove page breaks
	sed -r "s/…/.../g" -i $RAWTEXT_FILE # Remove page breaks

	#“”


	# Insert a references section
	echo '' >> $RAWTEXT_FILE
	echo '## References' >> $RAWTEXT_FILE
	echo '' >> $RAWTEXT_FILE

	;;
	*)
	# it's not a pdf
	echo "This is not a PDF: $FILE"
	;;
	esac

	done


	echo "Mission accomplished ;)"