Skip to content

Instantly share code, notes, and snippets.

@zoeyfyi
Created March 19, 2017 02:30
Show Gist options
  • Save zoeyfyi/b4d0172f45811cab572e36e9a5c0b2e5 to your computer and use it in GitHub Desktop.
Save zoeyfyi/b4d0172f45811cab572e36e9a5c0b2e5 to your computer and use it in GitHub Desktop.
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams, LTRect, LTLine, LTTextBoxHorizontal
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
from pdfminer.converter import PDFPageAggregator
from wand.image import Image
# Render every page of PDF_NAME to an image, crop it to a fixed region
# (shortened when an "Answer space for question" label is found on the page)
# and save the result as a JPEG in OUTPUT_DIR.

PDF_NAME = 'c3u.pdf'
OUTPUT_DIR = 'test'

# Text that marks the start of the answer area on a page.
ANSWER_SPACE_PREFIX = "Answer space for question"

# Pages are rasterised at 72 * SCALE dpi and every crop measurement below is
# multiplied by SCALE.
# NOTE(review): this presumably relies on 72 PDF points per inch, so that one
# point maps to SCALE pixels -- confirm.
SCALE = 3

# Crop rectangle before scaling. NOTE(review): the values look hand-tuned for
# this particular paper layout -- confirm before reusing with other PDFs.
CROP_X = 46
CROP_Y = 66
CROP_WIDTH = 489
CROP_HEIGHT = 711
# Offset subtracted from the label's y0 before it shortens the crop height.
ANSWER_SPACE_OFFSET = 51

laparams = LAParams()
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Saving fails if the target directory is missing, so create it up front.
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# The context manager guarantees the PDF handle is closed even if a page
# fails to process.
with open(PDF_NAME, 'rb') as document:
    for i, page in enumerate(PDFPage.get_pages(document)):
        # Debug aid: uncomment to stop after the first three pages.
        # if i == 3:
        #     break
        print("Page {}".format(i))

        interpreter.process_page(page)
        layout = device.get_result()

        # y0 of the last matching "Answer space" label; 0 when the page has
        # no such label.
        space = 0
        textboxes = [obj for obj in layout
                     if isinstance(obj, LTTextBoxHorizontal)]
        for t in textboxes:
            if t.get_text().startswith(ANSWER_SPACE_PREFIX):
                space = t.y0

        # "<file>[<n>]" asks the image library to load only page n of the PDF.
        path = os.path.join(os.getcwd(), "{}[{}]".format(PDF_NAME, i))

        with Image(filename=path, resolution=int(72 * SCALE)) as img:
            # Crop the margins
            x = CROP_X * SCALE
            y = CROP_Y * SCALE
            width = CROP_WIDTH * SCALE
            height = CROP_HEIGHT * SCALE
            # Shorten the crop by the label position; max(..., 1) means at
            # least one pixel is always trimmed, even without a label.
            height -= max((space - ANSWER_SPACE_OFFSET) * SCALE, 1)
            img.crop(int(x), int(y), width=int(width), height=int(height))
            img.save(filename=os.path.join(OUTPUT_DIR, "test{}.jpg".format(i)))
@zoeyfyi
Copy link
Author

zoeyfyi commented Mar 19, 2017

  • Remove front page
  • Remove blank pages
  • Handle drawing boxes
  • Test more papers

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment