Created
September 7, 2017 17:08
-
-
Save TaylorMutch/8bf8f74043387ec05b62878d13a417bd to your computer and use it in GitHub Desktop.
Extracts JPEG images from a PDF.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Extracts JPGs from a PDF. Inspired by this StackOverflow post:
https://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
""" | |
import sys | |
# JPEG "start of image" / "end of image" markers.
JPG_START = b"\xff\xd8"
JPG_END = b"\xff\xd9"

# PDF keywords that bracket the raw bytes of a stream object.
STREAM_KEYWORD = b"stream"
ENDSTREAM_KEYWORD = b"endstream"

# How many bytes after the start of a "stream" keyword may precede the JPEG
# start marker (keyword itself plus the end-of-line that follows it).
START_WINDOW = 20


def find_jpg_spans(pdf):
    """Yield ``(start, end)`` byte offsets of every JPEG embedded in *pdf*.

    A JPEG is recognised as a PDF stream whose data begins with the JPEG
    start marker within ``START_WINDOW`` bytes of the ``stream`` keyword.
    ``pdf[start:end]`` is the complete image, end marker included.

    Args:
        pdf: The raw bytes of a PDF file.

    Yields:
        ``(start, end)`` tuples, in file order.

    Raises:
        ValueError: If a JPEG stream has no closing ``endstream`` keyword,
            or no JPEG end marker before that keyword.
    """
    pos = 0
    while True:
        stream_at = pdf.find(STREAM_KEYWORD, pos)
        if stream_at < 0:
            return

        jpg_start = pdf.find(JPG_START, stream_at, stream_at + START_WINDOW)
        if jpg_start < 0:
            # Not a JPEG stream (or the tail of an "endstream" keyword).
            # Step past this keyword only, so that no later keyword can be
            # jumped over.
            pos = stream_at + len(STREAM_KEYWORD)
            continue

        stream_end = pdf.find(ENDSTREAM_KEYWORD, jpg_start)
        if stream_end < 0:
            raise ValueError("Didn't find end of stream!")

        # The real end marker is the LAST one inside this stream. Searching
        # backwards and bounding the search to the stream keeps us from
        # picking up a marker that belongs to a later object.
        end_marker_at = pdf.rfind(JPG_END, jpg_start, stream_end)
        if end_marker_at < 0:
            raise ValueError("Didn't find end of JPG!")

        jpg_end = end_marker_at + len(JPG_END)
        yield jpg_start, jpg_end
        pos = jpg_end


def main(argv=None):
    """Extract every JPEG from the PDF named on the command line.

    Each image is written to the current directory as ``jpg<N>.jpg``,
    numbered from 0 in the order found, and its byte range is printed.

    Args:
        argv: Argument list to use instead of ``sys.argv`` (element 0 is the
            program name, element 1 the PDF path).
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        prog = args[0] if args else "extract_jpgs.py"
        sys.exit("usage: %s FILE.pdf" % prog)

    with open(args[1], "rb") as pdf_file:
        pdf = pdf_file.read()

    for njpg, (istart, iend) in enumerate(find_jpg_spans(pdf)):
        print("JPG %d from %d to %d" % (njpg, istart, iend))
        with open("jpg%d.jpg" % njpg, "wb") as jpgfile:
            jpgfile.write(pdf[istart:iend])


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment