Skip to content

Instantly share code, notes, and snippets.

@hyzyla
Last active February 16, 2019 23:35
Show Gist options
  • Save hyzyla/242d8e3ab5c1355f5212c676cc895f82 to your computer and use it in GitHub Desktop.
Save hyzyla/242d8e3ab5c1355f5212c676cc895f82 to your computer and use it in GitHub Desktop.
Function that removes any trailing garbage after the last %%EOF marker in a PDF
import io
import os
# The PDF end-of-file marker, stored reversed (kept for backward compatibility
# with code that imports this name).
EOF_REVERSED = b'%%EOF'[::-1]


def fix_PDF_EOF(content: io.BytesIO) -> io.BytesIO:
    """Truncate everything that follows the last ``%%EOF`` marker.

    The stream is modified in place: any bytes after the final ``%%EOF``
    are removed. If the stream contains no ``%%EOF`` marker at all, it is
    left untouched. The stream position is restored to where it was when
    the function was called.

    Args:
        content: A seekable binary stream holding the PDF data.

    Returns:
        The same stream object that was passed in.

    Raises:
        ValueError: If ``content`` does not support random access.
    """
    if not content.seekable():
        raise ValueError('Content must support random access')

    marker = EOF_REVERSED[::-1]
    # A marker may straddle two chunks; carrying over len(marker) - 1 bytes
    # from the previously scanned chunk is enough to catch that case.
    overlap = len(marker) - 1
    chunk_size = 4096

    initial_position = content.tell()
    # seek() returns the new absolute position, i.e. the stream size here
    window_end = content.seek(0, os.SEEK_END)

    carry = b''
    marker_end = None
    # Scan backwards chunk by chunk instead of one byte at a time.
    while window_end > 0:
        window_start = max(0, window_end - chunk_size)
        content.seek(window_start)
        window = content.read(window_end - window_start) + carry
        found = window.rfind(marker)
        if found != -1:
            marker_end = window_start + found + len(marker)
            break
        carry = window[:overlap]
        window_end = window_start

    # Only truncate when a marker was actually found, so a stream without
    # one is never resized.
    if marker_end is not None:
        content.truncate(marker_end)

    # move stream back to the caller's position
    content.seek(initial_position)
    return content
# tests.py =====================================================
import io
import pytest
@pytest.mark.parametrize('content, expected', [
    (b'%%EOF', b'%%EOF'),
    (b'%%EOF ', b'%%EOF'),
    (b' %%EOF ', b' %%EOF'),
    (b' %%EOF', b' %%EOF'),
    (b'', b''),
    (b'%%EOFB', b'%%EOF'),
    (b'%%EOF%%%EOFb', b'%%EOF%%%EOF'),
    (b'%%EO', b'%%EO'),
    (b' %%EOF', b' %%EOF'),
    (b'%B%EOF', b'%B%EOF'),
])
def test_normal(content, expected):
    """Check that data after the last %%EOF marker is dropped.

    Each case wraps the raw bytes in a fresh in-memory stream, runs
    fix_PDF_EOF on it and compares what can be read back afterwards
    with the expected bytes.
    """
    stream = io.BytesIO(content)
    fixed = fix_PDF_EOF(content=stream)
    remaining = fixed.read()
    assert remaining == expected
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment