Skip to content

Instantly share code, notes, and snippets.

@giorgimode
Last active December 14, 2021 06:24
Show Gist options
  • Save giorgimode/279c3e651cbc9bd8204fea698312303b to your computer and use it in GitHub Desktop.
Save giorgimode/279c3e651cbc9bd8204fea698312303b to your computer and use it in GitHub Desktop.
Utility class to merge either multiple pdf files or images and pdfs and make them PDF/A compliant(if provided pdfs are also PDF/A compliant)
import com.google.common.io.Resources;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Calendar;
import java.util.List;
import javax.xml.transform.TransformerException;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.preflight.parser.PreflightParser;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.schema.DublinCoreSchema;
import org.apache.xmpbox.schema.PDFAIdentificationSchema;
import org.apache.xmpbox.schema.XMPBasicSchema;
import org.apache.xmpbox.type.BadFieldValueException;
import org.apache.xmpbox.xml.XmpSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
//test
public final class PDFMerger {
private static final Logger LOG = LoggerFactory.getLogger(PDFMerger.class);
private static final String OUTPUT_CONDITION_IDENTIFIER = "sRGB IEC61966-2.1";
public static final String DOCUMENT_CREATOR = "Mr. Meeseeks";
public static final String DOCUMENT_SUBJECT = "Great subject";
public static final String DOCUMENT_TITLE = "Here goes your title";
/**
* Creates a compound PDF document from a list of input documents.
* <p>
* The merged document is PDF/A-1b compliant, provided the source documents are as well
*
* @param sources list of source PDF document streams.
* @return compound PDF document as a readable input stream.
* @throws IOException if anything goes wrong during PDF merge.
*/
public static ByteArrayOutputStream mergeFiles(final List<InputStream> sources) throws IOException {
Path mergeDirectory = Files.createTempDirectory("merge-" + System.currentTimeMillis());
try (ByteArrayOutputStream mergedPDFOutputStream = new ByteArrayOutputStream()) {
LOG.debug("Merging {} source documents into one PDF", sources.size());
PDFMergerUtility mixedPdfMerger = createMixedPdfMerger(sources, mergedPDFOutputStream, mergeDirectory);
mergeFileStreams(mergedPDFOutputStream, mixedPdfMerger);
return mergedPDFOutputStream;
} catch (Exception e) {
if (!(e instanceof IOException)) {
throw new IOException("PDF merge problem", e);
}
throw (IOException) e;
} finally {
FileUtils.deleteDirectory(mergeDirectory.toFile());
sources.forEach(IOUtils::closeQuietly);
}
}
/**
* Creates a compound PDF document from a list of PDF documents.
* <p>
* The merged document is PDF/A-1b compliant
*
* @param sources list of source PDF document streams.
* @return compound PDF document as a readable input stream.
* @throws IOException if anything goes wrong during PDF merge.
*/
public static ByteArrayOutputStream mergePdfFiles(final List<InputStream> sources) throws IOException {
try (ByteArrayOutputStream mergedPDFOutputStream = new ByteArrayOutputStream()) {
LOG.debug("Merging {} source documents into one PDF", sources.size());
PDFMergerUtility pdfMerger = createPdfMerger(sources, mergedPDFOutputStream);
mergeFileStreams(mergedPDFOutputStream, pdfMerger);
return mergedPDFOutputStream;
} catch (Exception e) {
if (!(e instanceof IOException)) {
throw new IOException("PDF merge problem", e);
}
throw (IOException) e;
} finally {
sources.forEach(IOUtils::closeQuietly);
}
}
private static void mergeFileStreams(ByteArrayOutputStream mergedPDFOutputStream, PDFMergerUtility pdfMerger)
throws IOException, BadFieldValueException, TransformerException {
LOG.debug("Initialising PDF merge utility");
try (COSStream cosStream = new COSStream()) {
// PDF and XMP properties must be identical, otherwise document is not PDF/A compliant
pdfMerger.setDestinationDocumentInformation(createPDFDocumentInfo());
pdfMerger.setDestinationMetadata(createXMPMetadata(cosStream));
pdfMerger.mergeDocuments(MemoryUsageSetting.setupTempFileOnly());
LOG.debug("PDF merge successful, size = {} bytes", mergedPDFOutputStream.size());
}
}
@SuppressWarnings("UnstableApiUsage")
private static PDFMergerUtility createMixedPdfMerger(List<InputStream> sources, ByteArrayOutputStream mergedPDFOutputStream, Path mergeDirectory) throws IOException {
PDFMergerUtility pdfMerger = new PDFMergerUtility();
byte[] colorProfile = org.apache.commons.io.IOUtils.toByteArray(Resources.getResource("sRGB.icc"));
for (InputStream source : sources) {
File file = streamToFile(mergeDirectory, source);
if (isPdf(file)) {
pdfMerger.addSource(file);
} else {
pdfMerger.addSource(imageToPDDocument(mergeDirectory, file, colorProfile));
}
}
pdfMerger.setDestinationStream(mergedPDFOutputStream);
return pdfMerger;
}
private static PDFMergerUtility createPdfMerger(List<InputStream> documents, ByteArrayOutputStream mergedPDFOutputStream) {
PDFMergerUtility pdfMerger = new PDFMergerUtility();
pdfMerger.setDestinationStream(mergedPDFOutputStream);
documents.forEach(pdfMerger::addSource);
return pdfMerger;
}
private static PDDocumentInformation createPDFDocumentInfo() {
LOG.debug("Setting document info (title, author, subject) for merged PDF");
PDDocumentInformation documentInformation = new PDDocumentInformation();
documentInformation.setTitle(DOCUMENT_TITLE);
documentInformation.setCreator(DOCUMENT_CREATOR);
documentInformation.setSubject(DOCUMENT_SUBJECT);
return documentInformation;
}
private static PDMetadata createXMPMetadata(COSStream cosStream)
throws BadFieldValueException, TransformerException, IOException {
LOG.debug("Setting XMP metadata (title, author, subject) for merged PDF");
XMPMetadata xmpMetadata = XMPMetadata.createXMPMetadata();
// PDF/A-1b properties
PDFAIdentificationSchema pdfaSchema = xmpMetadata.createAndAddPFAIdentificationSchema();
pdfaSchema.setPart(1);
pdfaSchema.setConformance("B");
pdfaSchema.setAboutAsSimple("");
// Dublin Core properties
DublinCoreSchema dublinCoreSchema = xmpMetadata.createAndAddDublinCoreSchema();
dublinCoreSchema.setTitle(DOCUMENT_TITLE);
dublinCoreSchema.addCreator(DOCUMENT_CREATOR);
dublinCoreSchema.setDescription(DOCUMENT_SUBJECT);
// XMP Basic properties
XMPBasicSchema basicSchema = xmpMetadata.createAndAddXMPBasicSchema();
Calendar creationDate = Calendar.getInstance();
basicSchema.setCreateDate(creationDate);
basicSchema.setModifyDate(creationDate);
basicSchema.setMetadataDate(creationDate);
basicSchema.setCreatorTool(DOCUMENT_CREATOR);
// Create and return XMP data structure in XML format
try (ByteArrayOutputStream xmpOutputStream = new ByteArrayOutputStream();
OutputStream cosXMPStream = cosStream.createOutputStream()) {
new XmpSerializer().serialize(xmpMetadata, xmpOutputStream, true);
cosXMPStream.write(xmpOutputStream.toByteArray());
return new PDMetadata(cosStream);
}
}
private static File imageToPDDocument(Path mergeDirectory, File file, byte[] colorProfile) throws IOException {
try (PDDocument doc = new PDDocument()) {
PDImageXObject pdImage = PDImageXObject.createFromFileByContent(file, doc);
drawPage(doc, pdImage);
doc.getDocumentCatalog().addOutputIntent(createColorScheme(doc, colorProfile));
File pdfFile = Files.createTempFile(mergeDirectory, String.valueOf(System.currentTimeMillis()), ".tmp").toFile();
doc.save(pdfFile);
return pdfFile;
}
}
private static void drawPage(PDDocument doc, PDImageXObject pdImage) throws IOException {
PDPage page;
pdImage.getCOSObject().setItem(COSName.SMASK, COSName.NONE);
boolean isLandscapeMode = pdImage.getWidth() > pdImage.getHeight();
if (isLandscapeMode) {
page = new PDPage(new PDRectangle(PDRectangle.A4.getHeight(), PDRectangle.A4.getWidth()));
float scale = Math.min(Math.min(PDRectangle.A4.getWidth() / pdImage.getHeight(), PDRectangle.A4.getHeight() / pdImage.getWidth()), 1);
float width = pdImage.getWidth() * scale;
float height = pdImage.getHeight() * scale;
// center the image
float startWidth = (PDRectangle.A4.getHeight() - width) / 2;
float startHeight = (PDRectangle.A4.getWidth() - height) / 2;
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
contentStream.drawImage(pdImage, startWidth, startHeight, width, height);
}
} else {
page = new PDPage(PDRectangle.A4);
float scale = Math.min(Math.min(PDRectangle.A4.getWidth() / pdImage.getWidth(), PDRectangle.A4.getHeight() / pdImage.getHeight()), 1);
float width = pdImage.getWidth() * scale;
float height = pdImage.getHeight() * scale;
// try to center the image
float startWidth = (PDRectangle.A4.getWidth() - width) / 2;
float startHeight = (PDRectangle.A4.getHeight() - height) / 2;
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
contentStream.drawImage(pdImage, startWidth, startHeight, width, height);
}
}
doc.addPage(page);
}
private static PDOutputIntent createColorScheme(PDDocument doc, byte[] colorProfile) throws IOException {
PDOutputIntent intent = new PDOutputIntent(doc, new ByteArrayInputStream(colorProfile));
intent.setInfo(OUTPUT_CONDITION_IDENTIFIER);
intent.setOutputCondition(OUTPUT_CONDITION_IDENTIFIER);
intent.setOutputConditionIdentifier(OUTPUT_CONDITION_IDENTIFIER);
intent.setRegistryName("http://www.color.org");
return intent;
}
private static boolean isPdf(File file) {
try {
PreflightParser preflightParser = new PreflightParser(file);
preflightParser.parse();
return true;
} catch (Exception e) {
return false;
}
}
private static File streamToFile(Path tempDirectory, InputStream in) throws IOException {
final Path tempFile = Files.createTempFile(tempDirectory, String.valueOf(System.currentTimeMillis()), ".tmp");
try (FileOutputStream out = new FileOutputStream(tempFile.toFile())) {
IOUtils.copy(in, out);
}
return tempFile.toFile();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment