Last active
December 14, 2021 06:24
-
-
Save giorgimode/279c3e651cbc9bd8204fea698312303b to your computer and use it in GitHub Desktop.
Utility class that merges multiple PDF files, or a mix of images and PDFs, into a single document that is PDF/A compliant (provided the input PDFs are themselves PDF/A compliant).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.google.common.io.Resources;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Calendar;
import java.util.List;
import java.util.Objects;
import javax.xml.transform.TransformerException;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.preflight.parser.PreflightParser;
import org.apache.xmpbox.XMPMetadata;
import org.apache.xmpbox.schema.DublinCoreSchema;
import org.apache.xmpbox.schema.PDFAIdentificationSchema;
import org.apache.xmpbox.schema.XMPBasicSchema;
import org.apache.xmpbox.type.BadFieldValueException;
import org.apache.xmpbox.xml.XmpSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
//test | |
public final class PDFMerger { | |
private static final Logger LOG = LoggerFactory.getLogger(PDFMerger.class); | |
private static final String OUTPUT_CONDITION_IDENTIFIER = "sRGB IEC61966-2.1"; | |
public static final String DOCUMENT_CREATOR = "Mr. Meeseeks"; | |
public static final String DOCUMENT_SUBJECT = "Great subject"; | |
public static final String DOCUMENT_TITLE = "Here goes your title"; | |
/** | |
* Creates a compound PDF document from a list of input documents. | |
* <p> | |
* The merged document is PDF/A-1b compliant, provided the source documents are as well | |
* | |
* @param sources list of source PDF document streams. | |
* @return compound PDF document as a readable input stream. | |
* @throws IOException if anything goes wrong during PDF merge. | |
*/ | |
public static ByteArrayOutputStream mergeFiles(final List<InputStream> sources) throws IOException { | |
Path mergeDirectory = Files.createTempDirectory("merge-" + System.currentTimeMillis()); | |
try (ByteArrayOutputStream mergedPDFOutputStream = new ByteArrayOutputStream()) { | |
LOG.debug("Merging {} source documents into one PDF", sources.size()); | |
PDFMergerUtility mixedPdfMerger = createMixedPdfMerger(sources, mergedPDFOutputStream, mergeDirectory); | |
mergeFileStreams(mergedPDFOutputStream, mixedPdfMerger); | |
return mergedPDFOutputStream; | |
} catch (Exception e) { | |
if (!(e instanceof IOException)) { | |
throw new IOException("PDF merge problem", e); | |
} | |
throw (IOException) e; | |
} finally { | |
FileUtils.deleteDirectory(mergeDirectory.toFile()); | |
sources.forEach(IOUtils::closeQuietly); | |
} | |
} | |
/** | |
* Creates a compound PDF document from a list of PDF documents. | |
* <p> | |
* The merged document is PDF/A-1b compliant | |
* | |
* @param sources list of source PDF document streams. | |
* @return compound PDF document as a readable input stream. | |
* @throws IOException if anything goes wrong during PDF merge. | |
*/ | |
public static ByteArrayOutputStream mergePdfFiles(final List<InputStream> sources) throws IOException { | |
try (ByteArrayOutputStream mergedPDFOutputStream = new ByteArrayOutputStream()) { | |
LOG.debug("Merging {} source documents into one PDF", sources.size()); | |
PDFMergerUtility pdfMerger = createPdfMerger(sources, mergedPDFOutputStream); | |
mergeFileStreams(mergedPDFOutputStream, pdfMerger); | |
return mergedPDFOutputStream; | |
} catch (Exception e) { | |
if (!(e instanceof IOException)) { | |
throw new IOException("PDF merge problem", e); | |
} | |
throw (IOException) e; | |
} finally { | |
sources.forEach(IOUtils::closeQuietly); | |
} | |
} | |
private static void mergeFileStreams(ByteArrayOutputStream mergedPDFOutputStream, PDFMergerUtility pdfMerger) | |
throws IOException, BadFieldValueException, TransformerException { | |
LOG.debug("Initialising PDF merge utility"); | |
try (COSStream cosStream = new COSStream()) { | |
// PDF and XMP properties must be identical, otherwise document is not PDF/A compliant | |
pdfMerger.setDestinationDocumentInformation(createPDFDocumentInfo()); | |
pdfMerger.setDestinationMetadata(createXMPMetadata(cosStream)); | |
pdfMerger.mergeDocuments(MemoryUsageSetting.setupTempFileOnly()); | |
LOG.debug("PDF merge successful, size = {} bytes", mergedPDFOutputStream.size()); | |
} | |
} | |
@SuppressWarnings("UnstableApiUsage") | |
private static PDFMergerUtility createMixedPdfMerger(List<InputStream> sources, ByteArrayOutputStream mergedPDFOutputStream, Path mergeDirectory) throws IOException { | |
PDFMergerUtility pdfMerger = new PDFMergerUtility(); | |
byte[] colorProfile = org.apache.commons.io.IOUtils.toByteArray(Resources.getResource("sRGB.icc")); | |
for (InputStream source : sources) { | |
File file = streamToFile(mergeDirectory, source); | |
if (isPdf(file)) { | |
pdfMerger.addSource(file); | |
} else { | |
pdfMerger.addSource(imageToPDDocument(mergeDirectory, file, colorProfile)); | |
} | |
} | |
pdfMerger.setDestinationStream(mergedPDFOutputStream); | |
return pdfMerger; | |
} | |
private static PDFMergerUtility createPdfMerger(List<InputStream> documents, ByteArrayOutputStream mergedPDFOutputStream) { | |
PDFMergerUtility pdfMerger = new PDFMergerUtility(); | |
pdfMerger.setDestinationStream(mergedPDFOutputStream); | |
documents.forEach(pdfMerger::addSource); | |
return pdfMerger; | |
} | |
private static PDDocumentInformation createPDFDocumentInfo() { | |
LOG.debug("Setting document info (title, author, subject) for merged PDF"); | |
PDDocumentInformation documentInformation = new PDDocumentInformation(); | |
documentInformation.setTitle(DOCUMENT_TITLE); | |
documentInformation.setCreator(DOCUMENT_CREATOR); | |
documentInformation.setSubject(DOCUMENT_SUBJECT); | |
return documentInformation; | |
} | |
private static PDMetadata createXMPMetadata(COSStream cosStream) | |
throws BadFieldValueException, TransformerException, IOException { | |
LOG.debug("Setting XMP metadata (title, author, subject) for merged PDF"); | |
XMPMetadata xmpMetadata = XMPMetadata.createXMPMetadata(); | |
// PDF/A-1b properties | |
PDFAIdentificationSchema pdfaSchema = xmpMetadata.createAndAddPFAIdentificationSchema(); | |
pdfaSchema.setPart(1); | |
pdfaSchema.setConformance("B"); | |
pdfaSchema.setAboutAsSimple(""); | |
// Dublin Core properties | |
DublinCoreSchema dublinCoreSchema = xmpMetadata.createAndAddDublinCoreSchema(); | |
dublinCoreSchema.setTitle(DOCUMENT_TITLE); | |
dublinCoreSchema.addCreator(DOCUMENT_CREATOR); | |
dublinCoreSchema.setDescription(DOCUMENT_SUBJECT); | |
// XMP Basic properties | |
XMPBasicSchema basicSchema = xmpMetadata.createAndAddXMPBasicSchema(); | |
Calendar creationDate = Calendar.getInstance(); | |
basicSchema.setCreateDate(creationDate); | |
basicSchema.setModifyDate(creationDate); | |
basicSchema.setMetadataDate(creationDate); | |
basicSchema.setCreatorTool(DOCUMENT_CREATOR); | |
// Create and return XMP data structure in XML format | |
try (ByteArrayOutputStream xmpOutputStream = new ByteArrayOutputStream(); | |
OutputStream cosXMPStream = cosStream.createOutputStream()) { | |
new XmpSerializer().serialize(xmpMetadata, xmpOutputStream, true); | |
cosXMPStream.write(xmpOutputStream.toByteArray()); | |
return new PDMetadata(cosStream); | |
} | |
} | |
private static File imageToPDDocument(Path mergeDirectory, File file, byte[] colorProfile) throws IOException { | |
try (PDDocument doc = new PDDocument()) { | |
PDImageXObject pdImage = PDImageXObject.createFromFileByContent(file, doc); | |
drawPage(doc, pdImage); | |
doc.getDocumentCatalog().addOutputIntent(createColorScheme(doc, colorProfile)); | |
File pdfFile = Files.createTempFile(mergeDirectory, String.valueOf(System.currentTimeMillis()), ".tmp").toFile(); | |
doc.save(pdfFile); | |
return pdfFile; | |
} | |
} | |
private static void drawPage(PDDocument doc, PDImageXObject pdImage) throws IOException { | |
PDPage page; | |
pdImage.getCOSObject().setItem(COSName.SMASK, COSName.NONE); | |
boolean isLandscapeMode = pdImage.getWidth() > pdImage.getHeight(); | |
if (isLandscapeMode) { | |
page = new PDPage(new PDRectangle(PDRectangle.A4.getHeight(), PDRectangle.A4.getWidth())); | |
float scale = Math.min(Math.min(PDRectangle.A4.getWidth() / pdImage.getHeight(), PDRectangle.A4.getHeight() / pdImage.getWidth()), 1); | |
float width = pdImage.getWidth() * scale; | |
float height = pdImage.getHeight() * scale; | |
// center the image | |
float startWidth = (PDRectangle.A4.getHeight() - width) / 2; | |
float startHeight = (PDRectangle.A4.getWidth() - height) / 2; | |
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) { | |
contentStream.drawImage(pdImage, startWidth, startHeight, width, height); | |
} | |
} else { | |
page = new PDPage(PDRectangle.A4); | |
float scale = Math.min(Math.min(PDRectangle.A4.getWidth() / pdImage.getWidth(), PDRectangle.A4.getHeight() / pdImage.getHeight()), 1); | |
float width = pdImage.getWidth() * scale; | |
float height = pdImage.getHeight() * scale; | |
// try to center the image | |
float startWidth = (PDRectangle.A4.getWidth() - width) / 2; | |
float startHeight = (PDRectangle.A4.getHeight() - height) / 2; | |
try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) { | |
contentStream.drawImage(pdImage, startWidth, startHeight, width, height); | |
} | |
} | |
doc.addPage(page); | |
} | |
private static PDOutputIntent createColorScheme(PDDocument doc, byte[] colorProfile) throws IOException { | |
PDOutputIntent intent = new PDOutputIntent(doc, new ByteArrayInputStream(colorProfile)); | |
intent.setInfo(OUTPUT_CONDITION_IDENTIFIER); | |
intent.setOutputCondition(OUTPUT_CONDITION_IDENTIFIER); | |
intent.setOutputConditionIdentifier(OUTPUT_CONDITION_IDENTIFIER); | |
intent.setRegistryName("http://www.color.org"); | |
return intent; | |
} | |
private static boolean isPdf(File file) { | |
try { | |
PreflightParser preflightParser = new PreflightParser(file); | |
preflightParser.parse(); | |
return true; | |
} catch (Exception e) { | |
return false; | |
} | |
} | |
private static File streamToFile(Path tempDirectory, InputStream in) throws IOException { | |
final Path tempFile = Files.createTempFile(tempDirectory, String.valueOf(System.currentTimeMillis()), ".tmp"); | |
try (FileOutputStream out = new FileOutputStream(tempFile.toFile())) { | |
IOUtils.copy(in, out); | |
} | |
return tempFile.toFile(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment