Created
September 27, 2019 00:42
-
-
Save afrendeiro/a772506a60fb75acae5a9979db224209 to your computer and use it in GitHub Desktop.
An example of how to create and validate a PDF/A-compatible file for submission as a thesis at the Medical University of Vienna
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# This script details an example of how to create and validate a PDF/A
# compatible PDF file for submission as a thesis at the Medical University
# of Vienna.
# It assumes that the working directory contains ${PAPER}.initial.pdf files
# that will make up your cumulative thesis.
# The first cause of failure for PDF/A compatibility is the PDFs from your
# papers, especially if using documents from journals or PIs.
# It is therefore vital to use something to validate your existing PDFs
# before even starting. Good software for that is VeraPDF, for example:
#   https://docs.verapdf.org/
# Apache Preflight, etc.
#   https://www.pdf-online.com/osa/validate.aspx

# Download and unpack the VeraPDF installer.
# NOTE(review): the archive is fetched over plain HTTP and then executed;
# switch to HTTPS if the host offers it.
wget http://downloads.verapdf.org/rel/verapdf-installer.zip
unzip verapdf-installer.zip
# launch the GUI to install
./verapdf-*/verapdf-install
# Make the `verapdf` launcher reachable. This assumes ./verapdf was chosen as
# the installation directory in the GUI. The entry is absolute so that it keeps
# working if the working directory changes later on.
export PATH="${PATH}:${PWD}/verapdf"
# Validate your paper PDFs against the PDF/A-1b flavour (-f 1b).
# A failing paper is expected here and is dealt with further below, so the
# exit status of verapdf is deliberately not checked.
for PAPER in paper01.initial.pdf paper01_s.initial.pdf paper02.initial.pdf; do
  verapdf -f 1b "${PAPER}"
done
# In my case my PDFs did not pass validation, so I am completely discarding them
# and making new ones from rasterized PNGs:
for PAPER in paper01 paper01_s paper02; do
  # Convert each page to PNG (one ${PAPER}.NNN.png file per page, at 300 dpi)
  ghostscript \
    -sDEVICE=png16m \
    -dTextAlphaBits=4 \
    -r300 \
    -dColorImageResolution=150 \
    -o "${PAPER}.%03d.png" "${PAPER}.initial.pdf"

  # Collect the rendered pages with a glob instead of parsing `ls`.
  # The pattern requires ".<digit>" right after the paper name, so "paper01"
  # never picks up pages that belong to "paper01_s".
  pages=("${PAPER}".[0-9]*.png)
  if [[ ! -e "${pages[0]}" ]]; then
    printf 'No pages were rendered for %s; skipping it.\n' "${PAPER}" >&2
    continue
  fi

  # Convert each page from PNG to PDF (page.png -> page.png.pdf)
  page_pdfs=("${pages[@]/%/.pdf}")
  for page in "${pages[@]}"; do
    convert -quality 20 "${page}" "${page}.pdf"
  done

  # Assemble PDFs in one multi-page
  pdftk "${page_pdfs[@]}" cat output "${PAPER}.pdf"

  # Optimize, reduce quality and make PDF/A compatible.
  # PDFACompatibilityPolicy is an integer parameter, hence -d rather than -s
  # (this also matches the final conversion at the end of the script).
  ghostscript \
    -dPDFA -dBATCH -dNOPAUSE \
    -dColorImageResolution=150 \
    -sProcessColorModel=DeviceCMYK \
    -sDEVICE=pdfwrite -dPDFACompatibilityPolicy=1 \
    -sOutputFile="${PAPER}.pdfa.pdf" \
    "${PAPER}.pdf"

  # Remove only the intermediate per-page files created above
  rm -- "${pages[@]}" "${page_pdfs[@]}"
done
# Assemble papers with thesis.
# Each input gets a handle (A-D); `cat` then interleaves page ranges:
# thesis pages 1-24, paper01, paper01_s, thesis page 25, paper02 and
# finally the rest of the thesis (page 26 onwards).
pdftk \
  A=thesis.pdf \
  B=paper01.pdfa.pdf \
  C=paper01_s.pdfa.pdf \
  D=paper02.pdfa.pdf \
  cat A1-24 B1-end C1-end A25 D1-end A26-end \
  output \
  rendeiro.thesis_assembled.pdf
# We will now fix the broken structure of the bookmark index.
# Basically it is just adding the number of pages your papers have.

#######################################
# Rewrite, in place, the page numbers of a `pdftk dump_data` listing so that
# they point at the right pages of the assembled thesis.
# The mapping is specific to this thesis (46 pages before, 123 after):
#   page 25      -> 57         (32 paper pages inserted before it)
#   pages 26-46  -> 103-123    (77 paper pages inserted before them)
#   pages 1-24   -> unchanged
# Values are compared as whole numbers, so e.g. 250 is never mistaken for 25,
# and everything is done in a single pass over the file.
# Arguments: $1 - path to the dump_data text file to update
# Returns:   non-zero if the file could not be rewritten
#######################################
remap_bookmark_pages() {
  local info_file=$1
  local tmp_file="${info_file}.tmp"

  awk '
    $1 == "NumberOfPages:" && $2 == 46 { $2 = 123 }
    $1 == "BookmarkPageNumber:" {
      if ($2 == 25) {
        $2 = 57
      } else if ($2 >= 26 && $2 <= 46) {
        $2 += 77
      }
    }
    { print }
  ' "${info_file}" > "${tmp_file}" \
    && mv -- "${tmp_file}" "${info_file}"
}

# get index
pdftk thesis.pdf dump_data > info.txt
# replace with updated page numbers
remap_bookmark_pages info.txt
# Write the corrected index (info.txt) back into the assembled PDF.
# pdftk does not modify its input; the result goes to a new file.
pdftk \
  rendeiro.thesis_assembled.pdf \
  update_info \
  info.txt \
  output \
  rendeiro.thesis_assembled.reindexed.pdf
# The index dump is no longer needed
rm info.txt
# Convert the assembled, reindexed thesis to PDF/A.
# -dBATCH -dNOPAUSE make Ghostscript run non-interactively and exit when done.
ghostscript \
  -dPDFA \
  -dBATCH -dNOPAUSE \
  -sProcessColorModel=DeviceCMYK \
  -sColorConversionStrategy=UseDeviceIndependentColor \
  -sDEVICE=pdfwrite \
  -dPDFACompatibilityPolicy=1 \
  -sOutputFile=rendeiro.thesis_assembled.reindexed.pdfa.pdf \
  rendeiro.thesis_assembled.reindexed.pdf
# Validate the final PDF/A with the same flavour (1b) used for the papers
verapdf \
  -f 1b \
  rendeiro.thesis_assembled.reindexed.pdfa.pdf
# Voila! You are a doctor!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment