afrendeiro/make_validate_pdfa_muw.sh

## make_validate_pdfa_muw.sh
#!/usr/bin/bash

# This script details an example of how to create and validate a PDF/A
# compatible PDF file for submission as a thesis at the Medical University
# of Vienna

# It assumes in the working directory there are ${PAPER}.initial.pdf files
# that will make up your cumulative thesis

# The first cause of failure for PDF/A compatibility is the PDFs from your
# papers, especially if using documents from journals or PIs

# It is therefore vital to use something to validate your existing PDFs
# before even starting. Good software for that is VeraPDF for example:
# https://docs.verapdf.org/
# Apache preflight, etc
# https://www.pdf-online.com/osa/validate.aspx
# Fetch and unpack the veraPDF installer, then launch its GUI. The steps are
# chained with && so that a failed download or extraction does not go on to
# run a missing (or stale) installer.
# NOTE(review): the archive is downloaded over plain HTTP and then executed;
# prefer an HTTPS source if one is available.
wget http://downloads.verapdf.org/rel/verapdf-installer.zip \
  && unzip verapdf-installer.zip \
  && ./verapdf-*/verapdf-install

# Make the `verapdf` launcher callable by name. This assumes the GUI installer
# was pointed at ./verapdf. The entry is absolute so that it keeps resolving
# even if the working directory changes later on.
export PATH="${PATH}:${PWD}/verapdf"


# Run veraPDF with flavour 1b over each of the original paper PDFs
INITIAL_PDFS=(paper01.initial.pdf paper01_s.initial.pdf paper02.initial.pdf)
for PAPER in "${INITIAL_PDFS[@]}"; do
  verapdf -f 1b "${PAPER}"
done

# In my case my PDFs did not pass validation, so I am completely discarding them
# and making new ones from rasterized PNGs:
for PAPER in paper01 paper01_s paper02; do
  # Render each page to its own PNG, named ${PAPER}.NNN.png, at 300 dpi
  ghostscript \
    -sDEVICE=png16m \
    -dTextAlphaBits=4 \
    -r300 \
    -dColorImageResolution=150 \
    -o "${PAPER}.%03d.png" "${PAPER}.initial.pdf"

  # Convert each page from PNG to PDF.
  # The pattern "${PAPER}.*.png" (note the dot after the name) keeps paper01
  # from also matching paper01_s pages, and globbing directly avoids
  # word-splitting the output of `ls`.
  for F in "${PAPER}".*.png; do
    [ -e "${F}" ] || continue # the glob matched nothing
    convert -quality 20 "${F}" "${F}.pdf"
  done

  # Assemble the single-page PDFs into one multi-page PDF
  pdftk "${PAPER}".*.png.pdf cat output "${PAPER}.pdf"

  # Optimize, reduce quality and make PDF/A compatible.
  # PDFACompatibilityPolicy is an integer parameter, so it is passed with -d
  # (not -s, which would define it as a string).
  ghostscript \
    -dPDFA -dBATCH -dNOPAUSE \
    -dColorImageResolution=150 \
    -sProcessColorModel=DeviceCMYK \
    -sDEVICE=pdfwrite -dPDFACompatibilityPolicy=1 \
    "-sOutputFile=${PAPER}.pdfa.pdf" \
    "${PAPER}.pdf"

  # Remove the per-page intermediates of this paper only
  rm -- "${PAPER}".*.png "${PAPER}".*.png.pdf
done

# Interleave the papers into the thesis: thesis pages 1-24, all of paper01
# and paper01_s, thesis page 25, all of paper02, then thesis pages 26 onward
INPUT_HANDLES=('A=thesis.pdf' 'B=paper01.pdfa.pdf' 'C=paper01_s.pdfa.pdf' 'D=paper02.pdfa.pdf')
PAGE_ORDER=(A1-24 B1-end C1-end A25 D1-end A26-end)
pdftk "${INPUT_HANDLES[@]}" cat "${PAGE_ORDER[@]}" output rendeiro.thesis_assembled.pdf

# We will now fix the broken structure of the bookmark index.
# The bookmark data dumped from thesis.pdf still carries the page numbers of
# the 46-page thesis; each one is shifted by the number of paper pages that
# now precede it in the 123-page assembled file.
# get index
pdftk thesis.pdf dump_data > info.txt

# Replace with updated page numbers in a single sed pass. Every pattern is
# anchored with ^...$ so that e.g. "25" can never match as the prefix of a
# longer number or inside an unrelated field.
SED_ARGS=(
  -e 's/^NumberOfPages: 46$/NumberOfPages: 123/'
  -e 's/^BookmarkPageNumber: 25$/BookmarkPageNumber: 57/'
  -e 's/^BookmarkPageNumber: 26$/BookmarkPageNumber: 103/'
)
for I in {27..46}; do
  SED_ARGS+=(-e "s/^BookmarkPageNumber: ${I}\$/BookmarkPageNumber: $((I + 77))/")
done
sed -i "${SED_ARGS[@]}" info.txt

# update index back to assembled PDF
pdftk \
  rendeiro.thesis_assembled.pdf \
  update_info \
  info.txt \
  output \
  rendeiro.thesis_assembled.reindexed.pdf
rm info.txt

# Re-distill the reindexed thesis through Ghostscript's pdfwrite device with
# the PDF/A switches enabled
REINDEXED_PDF=rendeiro.thesis_assembled.reindexed.pdf
FINAL_PDFA=rendeiro.thesis_assembled.reindexed.pdfa.pdf
GS_PDFA_FLAGS=(
  -dPDFA
  -dBATCH -dNOPAUSE
  -sProcessColorModel=DeviceCMYK
  -sColorConversionStrategy=UseDeviceIndependentColor
  -sDEVICE=pdfwrite
  -dPDFACompatibilityPolicy=1
)
ghostscript "${GS_PDFA_FLAGS[@]}" "-sOutputFile=${FINAL_PDFA}" "${REINDEXED_PDF}"


# Check the finished document with veraPDF, flavour 1b
verapdf -f 1b "${FINAL_PDFA}"

# Voila! You are a doctor!
	#!/usr/bin/bash

	# This script details an example of how to create and validate a PDF/A
	# compatible PDF file for submission as a thesis at the Medical University
	# of Vienna

	# It assumes in the working directory there are ${PAPER}.initial.pdf files
	# that will make up your cumulative thesis

	# The first cause of failure for PDF/A compatibility is the PDFs from your
	# papers, especially if using documents from journals or PIs

	# It is therefore vital to use something to validate your existing PDFs
	# before even starting. Good software for that is VeraPDF for example:
	# https://docs.verapdf.org/
	# Apache preflight, etc
	# https://www.pdf-online.com/osa/validate.aspx
	# Fetch and unpack the veraPDF installer, then launch its GUI. The steps are
	# chained with && so that a failed download or extraction does not go on to
	# run a missing (or stale) installer.
	# NOTE(review): the archive is downloaded over plain HTTP and then executed;
	# prefer an HTTPS source if one is available.
	wget http://downloads.verapdf.org/rel/verapdf-installer.zip \
	  && unzip verapdf-installer.zip \
	  && ./verapdf-*/verapdf-install

	# Make the `verapdf` launcher callable by name. This assumes the GUI installer
	# was pointed at ./verapdf. The entry is absolute so that it keeps resolving
	# even if the working directory changes later on.
	export PATH="${PATH}:${PWD}/verapdf"


	# Run veraPDF with flavour 1b over each of the original paper PDFs
	INITIAL_PDFS=(paper01.initial.pdf paper01_s.initial.pdf paper02.initial.pdf)
	for PAPER in "${INITIAL_PDFS[@]}"; do
	  verapdf -f 1b "${PAPER}"
	done

	# In my case my PDFs did not pass validation, so I am completely discarding them
	# and making new ones from rasterized PNGs:
	for PAPER in paper01 paper01_s paper02; do
	  # Render each page to its own PNG, named ${PAPER}.NNN.png, at 300 dpi
	  ghostscript \
	    -sDEVICE=png16m \
	    -dTextAlphaBits=4 \
	    -r300 \
	    -dColorImageResolution=150 \
	    -o "${PAPER}.%03d.png" "${PAPER}.initial.pdf"

	  # Convert each page from PNG to PDF.
	  # The pattern "${PAPER}.*.png" (note the dot after the name) keeps paper01
	  # from also matching paper01_s pages, and globbing directly avoids
	  # word-splitting the output of `ls`.
	  for F in "${PAPER}".*.png; do
	    [ -e "${F}" ] || continue # the glob matched nothing
	    convert -quality 20 "${F}" "${F}.pdf"
	  done

	  # Assemble the single-page PDFs into one multi-page PDF
	  pdftk "${PAPER}".*.png.pdf cat output "${PAPER}.pdf"

	  # Optimize, reduce quality and make PDF/A compatible.
	  # PDFACompatibilityPolicy is an integer parameter, so it is passed with -d
	  # (not -s, which would define it as a string).
	  ghostscript \
	    -dPDFA -dBATCH -dNOPAUSE \
	    -dColorImageResolution=150 \
	    -sProcessColorModel=DeviceCMYK \
	    -sDEVICE=pdfwrite -dPDFACompatibilityPolicy=1 \
	    "-sOutputFile=${PAPER}.pdfa.pdf" \
	    "${PAPER}.pdf"

	  # Remove the per-page intermediates of this paper only. The globs are
	  # required here: without them rm targets the non-existent ${PAPER}.png
	  # and ${PAPER}.png.pdf and leaves every page file behind.
	  rm -- "${PAPER}".*.png "${PAPER}".*.png.pdf
	done

	# Interleave the papers into the thesis: thesis pages 1-24, all of paper01
	# and paper01_s, thesis page 25, all of paper02, then thesis pages 26 onward
	INPUT_HANDLES=('A=thesis.pdf' 'B=paper01.pdfa.pdf' 'C=paper01_s.pdfa.pdf' 'D=paper02.pdfa.pdf')
	PAGE_ORDER=(A1-24 B1-end C1-end A25 D1-end A26-end)
	pdftk "${INPUT_HANDLES[@]}" cat "${PAGE_ORDER[@]}" output rendeiro.thesis_assembled.pdf

	# We will now fix the broken structure of the bookmark index.
	# The bookmark data dumped from thesis.pdf still carries the page numbers of
	# the 46-page thesis; each one is shifted by the number of paper pages that
	# now precede it in the 123-page assembled file.
	# get index
	pdftk thesis.pdf dump_data > info.txt

	# Replace with updated page numbers in a single sed pass. Every pattern is
	# anchored with ^...$ so that e.g. "25" can never match as the prefix of a
	# longer number or inside an unrelated field.
	SED_ARGS=(
	  -e 's/^NumberOfPages: 46$/NumberOfPages: 123/'
	  -e 's/^BookmarkPageNumber: 25$/BookmarkPageNumber: 57/'
	  -e 's/^BookmarkPageNumber: 26$/BookmarkPageNumber: 103/'
	)
	for I in {27..46}; do
	  SED_ARGS+=(-e "s/^BookmarkPageNumber: ${I}\$/BookmarkPageNumber: $((I + 77))/")
	done
	sed -i "${SED_ARGS[@]}" info.txt

	# update index back to assembled PDF
	pdftk \
	  rendeiro.thesis_assembled.pdf \
	  update_info \
	  info.txt \
	  output \
	  rendeiro.thesis_assembled.reindexed.pdf
	rm info.txt

	# Re-distill the reindexed thesis through Ghostscript's pdfwrite device with
	# the PDF/A switches enabled
	REINDEXED_PDF=rendeiro.thesis_assembled.reindexed.pdf
	FINAL_PDFA=rendeiro.thesis_assembled.reindexed.pdfa.pdf
	GS_PDFA_FLAGS=(
	  -dPDFA
	  -dBATCH -dNOPAUSE
	  -sProcessColorModel=DeviceCMYK
	  -sColorConversionStrategy=UseDeviceIndependentColor
	  -sDEVICE=pdfwrite
	  -dPDFACompatibilityPolicy=1
	)
	ghostscript "${GS_PDFA_FLAGS[@]}" "-sOutputFile=${FINAL_PDFA}" "${REINDEXED_PDF}"


	# Check the finished document with veraPDF, flavour 1b
	verapdf -f 1b "${FINAL_PDFA}"

	# Voila! You are a doctor!