Skip to content

Instantly share code, notes, and snippets.

@shentonfreude
Created July 26, 2019 14:27
Show Gist options
  • Save shentonfreude/7ed79526b5c4f290a0e5631cfb31a1a6 to your computer and use it in GitHub Desktop.
Save shentonfreude/7ed79526b5c4f290a0e5631cfb31a1a6 to your computer and use it in GitHub Desktop.
Modified version of Ned Batchelder's PDF-to-JPG extractor, ported to Python 3 and usable as a library for single-page extraction
#!/usr/bin/env python3
# Adapted from https://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html
"""Extract jpg's from pdf's. Quick and dirty."""
import sys
# Byte sequences that delimit the embedded JPG data inside a PDF stream:
# 0xFFD8 is searched for as the beginning of the image, 0xFFD9 as its end.
STARTMARK = b"\xff\xd8"
ENDMARK = b"\xff\xd9"

# Adjustments added to the indexes at which the markers were found before
# slicing. The start needs none; the end is advanced past the end marker
# itself so that the marker is included in the extracted bytes.
STARTFIX = 0
ENDFIX = len(ENDMARK)
def extract_jpg_from_pdf(pdf_path):
    """Extract the first embedded JPG from a single-page PDF scan.

    The raw JPG bytes are sliced straight out of the PDF file; no image
    conversion takes place, so nothing is re-encoded and no quality is lost.
    The original author notes this is faster than going through GhostScript
    or ImageMagick.

    This is a heuristic byte scan, not a PDF parser: it only works for PDFs
    that embed a scanned page as a JPG stream (not text PDFs), it returns
    only the first JPG it finds, and it may not always be reliable.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The bytes of the first JPG found, from its start marker through its
        end marker inclusive.

    Raises:
        ValueError: If no JPG stream is found, or if the end of the stream or
            of the JPG data cannot be located.
        OSError: If the file cannot be opened or read.
    """
    # Read the whole file up front; the context manager guarantees the
    # handle is closed even if the read fails.
    with open(pdf_path, "rb") as pdf_file:
        pdf = pdf_file.read()

    i = 0
    while True:
        istream = pdf.find(b"stream", i)
        if istream < 0:
            break

        # The JPG start marker must appear within 20 bytes of the "stream"
        # keyword; otherwise this stream is skipped.
        istart = pdf.find(STARTMARK, istream, istream + 20)
        if istart < 0:
            i = istream + 20
            continue

        iend = pdf.find(b"endstream", istart)
        if iend < 0:
            raise ValueError("Did not find end of stream!")

        # Look for the JPG end marker starting 20 bytes before "endstream".
        # Clamp the search start to the JPG start: a negative start would be
        # interpreted relative to the end of the buffer, and a start before
        # the image could match a marker that precedes it.
        iend = pdf.find(ENDMARK, max(istart, iend - 20))
        if iend < 0:
            raise ValueError("Did not find end of JPG!")

        return pdf[istart + STARTFIX:iend + ENDFIX]

    raise ValueError(f"Could not extract JPG from PDF={pdf_path}")
def main():
    """Extract the JPG from the PDF named on the command line.

    The first command-line argument is the path of the PDF to read. The
    extracted bytes are written to ``jpgextracted.jpg`` in the current
    working directory, overwriting any existing file of that name.
    """
    if len(sys.argv) < 2:
        # Exit with a readable message instead of a bare IndexError.
        sys.exit("usage: pass the path of the PDF to extract as the first argument")

    jpg = extract_jpg_from_pdf(sys.argv[1])
    # The context manager closes the output file even if the write fails.
    with open("jpgextracted.jpg", "wb") as jpgfile:
        jpgfile.write(jpg)


# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment