Last active
February 16, 2019 23:35
-
-
Save hyzyla/242d8e3ab5c1355f5212c676cc895f82 to your computer and use it in GitHub Desktop.
Function that removes any trailing garbage in a PDF after the last %%EOF marker
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
import os | |
EOF_MARKER = b'%%EOF'
# Retained for backward compatibility with code that imports this name.
EOF_REVERSED = EOF_MARKER[::-1]

# Number of bytes fetched per backward step while searching for the marker.
_SEARCH_CHUNK_SIZE = 4096


def _find_last_marker(content, marker=EOF_MARKER, chunk_size=_SEARCH_CHUNK_SIZE):
    """Return the offset of the last ``marker`` in ``content``, or -1.

    The stream is scanned from its end towards its start in blocks of
    ``chunk_size`` bytes. The stream position is left wherever the last read
    ended; the caller is responsible for restoring it.
    """
    # A marker may straddle two neighbouring blocks, so the first
    # ``len(marker) - 1`` bytes of the block that follows are carried over.
    overlap = len(marker) - 1

    content.seek(0, os.SEEK_END)
    window_end = content.tell()
    carried = b''
    while window_end > 0:
        window_start = max(0, window_end - chunk_size)
        content.seek(window_start)
        # NOTE: assumes read(n) returns n bytes when they are available, as
        # io.BytesIO and buffered binary files do.
        window = content.read(window_end - window_start) + carried
        found = window.rfind(marker)
        if found >= 0:
            # Every later window was already searched without success, so
            # this is the last occurrence in the whole stream.
            return window_start + found
        carried = window[:overlap]
        window_end = window_start
    return -1


def fix_PDF_EOF(content: io.BytesIO) -> io.BytesIO:
    """Truncate everything that follows the last ``%%EOF`` in a PDF stream.

    The stream is modified in place. If it contains no ``%%EOF`` marker it is
    left untouched (in particular it is never extended). The stream position
    that was current on entry is restored before returning.

    Args:
        content: Seekable binary stream holding the PDF data.

    Returns:
        The same ``content`` object, for convenient chaining.

    Raises:
        ValueError: If ``content`` does not support random access.
    """
    if not content.seekable():
        raise ValueError('Content must support random access')

    initial_position = content.tell()

    marker_start = _find_last_marker(content)
    if marker_start >= 0:
        # Keep the marker itself and drop whatever comes after it.
        content.truncate(marker_start + len(EOF_MARKER))

    # Move the stream back to where the caller left it.
    content.seek(initial_position)
    return content
# tests.py ===================================================== | |
import io | |
import pytest | |
@pytest.mark.parametrize(
    ('content', 'expected'),
    [
        # marker alone, and with trailing / leading padding
        (b'%%EOF', b'%%EOF'),
        (b'%%EOF ', b'%%EOF'),
        (b' %%EOF ', b' %%EOF'),
        (b' %%EOF', b' %%EOF'),
        # empty input stays empty
        (b'', b''),
        # trailing garbage after the marker is dropped
        (b'%%EOFB', b'%%EOF'),
        # only data after the *last* marker is dropped
        (b'%%EOF%%%EOFb', b'%%EOF%%%EOF'),
        # incomplete or broken markers leave the data untouched
        (b'%%EO', b'%%EO'),
        (b' %%EOF', b' %%EOF'),
        (b'%B%EOF', b'%B%EOF'),
    ],
)
def test_normal(content, expected):
    """fix_PDF_EOF keeps data up to and including the last %%EOF marker."""
    stream = io.BytesIO(content)
    fixed = fix_PDF_EOF(content=stream)
    assert fixed.read() == expected
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment