Skip to content

Instantly share code, notes, and snippets.

@Romern
Last active November 3, 2023 15:27
Show Gist options
  • Save Romern/2dd6fe3be58cf7e71f7f87dee616ee6a to your computer and use it in GitHub Desktop.
Save Romern/2dd6fe3be58cf7e71f7f87dee616ee6a to your computer and use it in GitHub Desktop.
PostScript function to easily embed files in a PDF using pdfmark (by default it dumps every file matching /tmp/*), e.g. `gs -sDEVICE=pdfwrite -o foo.pdf embedfile.ps`. The Python script extracts the embedded files again.
%!PS
%%%%%%%%%%%%%%%%%%%Helper Functions
% (a) (b) concatstrings -> (ab)
% Builds a fresh string holding the first operand followed by the second;
% the operands themselves are only read, never written to.
/concatstrings { exch dup length
% stack: b a len(a)
2 index length add string
% stack: b a new            (new is len(a)+len(b) characters long)
dup dup 4 2 roll copy length
% stack: b new new len(a)   (a now occupies the start of new)
4 -1 roll putinterval
% stack: new                (b was written at offset len(a))
} bind def
%https://comp.lang.postscript.narkive.com/lb2y58U5/string-replace-in-postscript
% comp.lang.postscript FAQ 7.8:
% string1 string2 *append* string
% Function: Concatenates two strings together.
% The result is a newly allocated string: string1 followed by string2.
/append {
2 copy length exch length add % Combined length of both operands.
string dup % string1 string2 string string
4 2 roll % string string string1 string2
2 index 0 3 index % string string string1 string2 ...
% ... string 0 string1
putinterval % Copy string1 to offset 0 of the result.
% string string string1 string2
% Copy string2 to offset length(string1); only the result string remains.
exch length exch putinterval
} bind def
% replace string find *findandreplaceall* string'
% Function: Returns a new string in which every occurrence of find inside
% string has been substituted by replace. The result is accumulated in
% string', which sits at the bottom of the working stack during the loop.
/findandreplaceall {
() 4 1 roll % string' is an empty string initially.
% Stack at the start of every iteration: string' replace remainder find
{
search { % string' replace post find pre
% Match found: tail' is pre followed by replace.
3 index append 5 -1 roll % replace post find tail' string'
% Extend string' by tail' and restore the loop layout, with post as the
% remainder still to be searched.
exch append 4 1 roll % string' replace post find
} {
% No (further) match: search left only the unsearched remainder on top.
% Discard replace, attach the remainder to string' and leave the loop.
exch pop append
exit
} ifelse
} loop
} bind def
%%%%%%%%%%%%%%%%%%%Actual code
% filename *EmbedFile*
% Embeds the file located at parameter 1 (a file name string taken from the
% top of the operand stack) into the PDF as an attachment.
% https://ghostscript.com/blog/zugferd.html
/EmbedFile {
% Bind the file name to inputFileName with def so that the generated code
% below can refer to it by name when it is executed.
/inputFileName exch def
% As pdfmark does not support dynamically generated object names, the pdfmark
% code is kept as a string template that uses the placeholder {InvoiceStream}.
% findandreplaceall swaps every placeholder for "{" + inputFileName + "Stream}",
% and the resulting string is made executable (cvx) and run (exec).
% The template, in order: creates a stream object, puts the /EmbeddedFile
% dictionary entries on it, fills it from the file opened for reading, closes
% it, and finally issues /EMBED with a /FileSpec dictionary pointing at it.
% NOTE: the template is a string literal - do not add comments inside it.
({) inputFileName concatstrings (Stream}) concatstrings (
[ /_objdef {InvoiceStream} /type /stream /OBJ pdfmark
[ {InvoiceStream} << /Type /EmbeddedFile /Subtype (application/octet-stream) cvn >> /PUT pdfmark
[ {InvoiceStream} inputFileName (r) file /PUT pdfmark
[ {InvoiceStream} /CLOSE pdfmark
[ /Name inputFileName /FS <<
/Type /FileSpec
/F inputFileName
/AFRelationship /Alternative
/EF << /F {InvoiceStream} >>
>> /EMBED pdfmark
) ({InvoiceStream}) findandreplaceall cvx exec
} def
% Driver: embed every file whose name matches the template /tmp/*.
% filenameforall calls the procedure once per matching name, using the
% 4096-character string as scratch space for the name it hands over.
(/tmp/*)
{
  % Run EmbedFile in a stopped context so that an error on one file does not
  % abort the whole run; the boolean left by stopped is simply discarded.
  { EmbedFile } stopped pop
}
4096 string filenameforall
# Very slightly modified version of https://gist.github.com/kevinl95/29a9e18d474eb6e23372074deff2df38: takes the PDF path as a command-line argument and prints nothing by default.
import PyPDF2
import sys
from pathlib import Path
def _resolve(obj):
    """
    Dereferences obj if it offers a PyPDF2 dereferencing method.

    Newer PyPDF2 releases spell the method ``get_object``, older ones
    ``getObject``; the newer spelling is preferred when both exist. Objects
    offering neither are returned unchanged.
    """
    getter = getattr(obj, "get_object", None) or getattr(obj, "getObject", None)
    return obj if getter is None else getter()


def _stream_data(stream):
    """
    Returns the payload of a PDF stream object as bytes, using ``get_data``
    when available and falling back to the older ``getData``.
    """
    read = getattr(stream, "get_data", None) or stream.getData
    return read()


def _iter_name_tree(node):
    """
    Yields (key, value) pairs of a PDF name tree in document order.

    A node may carry a ``/Names`` array of alternating key / value entries,
    a ``/Kids`` array of child nodes, or both; children are walked
    recursively so that no leaf is missed.
    """
    node = _resolve(node)
    if "/Names" in node:
        entries = node["/Names"]
        # Even slots hold the keys, odd slots the matching values.
        for key, value in zip(entries[0::2], entries[1::2]):
            yield key, value
    if "/Kids" in node:
        for kid in node["/Kids"]:
            yield from _iter_name_tree(kid)


def getAttachments(reader):
    """
    Retrieves the file attachments of the PDF as a dictionary of file names
    and the file data as a bytestring.

    The whole ``/EmbeddedFiles`` name tree is visited (flat ``/Names`` arrays
    as well as any number of, possibly nested, ``/Kids``). A PDF without
    embedded files yields an empty dictionary. Entries whose key is not a
    ``str`` are skipped.

    :param reader: PDF reader object exposing ``trailer["/Root"]``
    :return: dictionary of filenames and bytestrings
    :raises KeyError: if a file specification has no ``/EF`` / ``/F`` entry
    """
    catalog = reader.trailer["/Root"]
    # Membership tests instead of .get(): item access is what the original
    # code relied on, so keep using it for the actual lookups.
    if "/Names" not in catalog:
        return {}
    names = catalog["/Names"]
    if "/EmbeddedFiles" not in names:
        return {}

    attachments = {}
    for name, spec in _iter_name_tree(names["/EmbeddedFiles"]):
        if not isinstance(name, str):
            continue
        file_spec = _resolve(spec)
        attachments[name] = _stream_data(_resolve(file_spec["/EF"]["/F"]))
    return attachments
def main(argv=None):
    """
    Extracts every attachment of the PDF named on the command line.

    Each attachment is written below the current working directory at a
    location mirroring its resolved absolute name (``/tmp/foo`` ends up in
    ``./tmp/foo``); missing parent directories are created. Nothing is
    printed on success.

    :param argv: argument list; defaults to ``sys.argv``
    :return: process exit status (0 on success, 2 on a usage error)
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else "extract"
        print("usage: {} FILE.pdf".format(prog), file=sys.stderr)
        return 2

    # Newer PyPDF2 releases call the class PdfReader; fall back to the
    # original PdfFileReader name where that is all there is.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader

    # Keep the input open while the attachments are read, and make sure it
    # is closed afterwards even if parsing fails.
    with open(argv[1], 'rb') as handler:
        attachments = getAttachments(reader_cls(handler))

    for fName, fData in attachments.items():
        # resolve() turns the embedded name into an absolute path; the "."
        # prefix re-roots that path under the current working directory.
        path = Path.cwd() / ("." + str(Path(fName).resolve()))
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'wb') as outfile:
            outfile.write(fData)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment