Skip to content

Instantly share code, notes, and snippets.

@tsuyukimakoto
Created December 9, 2012 14:48
Show Gist options
  • Save tsuyukimakoto/4245373 to your computer and use it in GitHub Desktop.
Save tsuyukimakoto/4245373 to your computer and use it in GitHub Desktop.
PDFMiner sample.
import sys
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.pdfdevice import PDFDevice
class ObjectFindDevice(PDFDevice):
    """PDF device that collects statistics instead of rendering anything.

    While a document is processed the device records:

    * ``fonts``     -- the set of font objects seen in ``render_string``
    * ``nof_pages`` -- how many times ``begin_page`` was called
    * ``pages``     -- the set of page objects passed to ``begin_page``

    Every other device callback is a deliberate no-op.
    """

    debug = 0

    def __init__(self, rsrcmgr):
        """Store the resource manager and start with empty counters."""
        # Collected results.
        self.pages = set()
        self.nof_pages = 0
        self.fonts = set()
        # State kept on the device itself.
        self.ctm = None
        self.rsrcmgr = rsrcmgr

    def begin_page(self, page, ctm):
        """Remember ``page`` and bump the page counter; ``ctm`` is unused."""
        self.pages.add(page)
        self.nof_pages = self.nof_pages + 1

    def end_page(self, page):
        """No-op: nothing needs to be done when a page ends."""

    def begin_figure(self, name, bbox, matrix):
        """No-op: figures are not inspected."""

    def end_figure(self, name):
        """No-op: figures are not inspected."""

    def paint_path(self, graphicstate, stroke, fill, evenodd, path):
        """No-op: vector paths are not inspected."""

    def render_image(self, name, stream):
        """No-op: images are not inspected."""

    def render_string(self, textstate, seq):
        """Record the font of the current text state; ``seq`` is unused."""
        current_font = textstate.font
        self.fonts.add(current_font)
def convert_point_to_mm(box):
    '''
    Convert every value in ``box`` from PDF points to millimetres.

    72 points = 1 inch = 25.4 mm, so each value is divided by 72 and
    then multiplied by 25.4. A new list is always returned, whatever
    iterable type ``box`` is.

    >>> convert_point_to_mm([0, 0, 283.4646, 419.5276])
    [0.0, 0.0, 100.00001166666667, 148.00001444444442]
    '''
    points_per_inch = 72.0
    mm_per_inch = 25.4
    millimetres = []
    for value in box:
        # Divide first, then multiply: keeps the floating-point result
        # identical to the documented example above.
        inches = value / points_per_inch
        millimetres.append(inches * mm_per_inch)
    return millimetres
if __name__ == '__main__':
    # Report the fonts, page count and page sizes of a PDF.
    #
    # Usage: python <this script> [path/to/file.pdf]
    # With no argument the original sample file is used.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else 'test_data/omotemen.pdf'

    rsrcmgr = PDFResourceManager(caching=True)
    device = ObjectFindDevice(rsrcmgr)

    # PDF is a binary format: open with 'rb' so no newline translation or
    # text decoding is applied to the bytes handed to the parser.
    with open(pdf_path, 'rb') as fp:
        process_pdf(rsrcmgr, device, fp)

        # Reporting stays inside the ``with`` block so the file remains open
        # while the collected page objects are inspected (NOTE(review):
        # probably not required, but harmless).
        print('fonts used: {}'.format(len(device.fonts)))
        print('no of pages: {}'.format(device.nof_pages))
        # ``device.pages`` is a set, so pages are not printed in document order.
        for page in device.pages:
            print(page.mediabox)
            print(convert_point_to_mm(page.mediabox))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment