Created
December 9, 2012 14:48
-
-
Save tsuyukimakoto/4245373 to your computer and use it in GitHub Desktop.
PDFMiner sample.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from pdfminer.pdfinterp import PDFResourceManager, process_pdf | |
from pdfminer.pdfdevice import PDFDevice | |
class ObjectFindDevice(PDFDevice):
    '''
    PDFDevice subclass that only takes stock of a document.

    It remembers every font handed to ``render_string`` and every page
    handed to ``begin_page``; all other device methods do nothing.
    '''

    debug = 0

    def __init__(self, rsrcmgr):
        '''Store the resource manager and start with empty tallies.'''
        self.rsrcmgr = rsrcmgr
        self.ctm = None
        # What has been seen so far.
        self.pages = set()
        self.nof_pages = 0
        self.fonts = set()

    def begin_page(self, page, ctm):
        '''Count the page and keep a reference to it; ``ctm`` is unused.'''
        self.nof_pages += 1
        self.pages.add(page)

    def end_page(self, page):
        '''No-op: nothing is recorded when a page ends.'''
        pass

    def begin_figure(self, name, bbox, matrix):
        '''No-op: figures are not tracked.'''
        pass

    def end_figure(self, name):
        '''No-op: figures are not tracked.'''
        pass

    def paint_path(self, graphicstate, stroke, fill, evenodd, path):
        '''No-op: paths are not tracked.'''
        pass

    def render_image(self, name, stream):
        '''No-op: images are not tracked.'''
        pass

    def render_string(self, textstate, seq):
        '''Remember the font of the text state; ``seq`` itself is ignored.'''
        self.fonts.add(textstate.font)
def convert_point_to_mm(box):
    '''
    Convert every value in ``box`` from points to millimetres.

    72point = 1inch = 25.4mm, so each value is divided by 72.0 and then
    multiplied by 25.4.  A new list is returned for any iterable ``box``.

    >>> convert_point_to_mm([0, 0, 283.4646, 419.5276])
    [0.0, 0.0, 100.00001166666667, 148.00001444444442]
    '''
    points_per_inch = 72.0
    mm_per_inch = 25.4
    millimetres = []
    for value in box:
        # Divide first, then multiply: this operation order yields the
        # exact floating-point results shown in the example above.
        millimetres.append(value / points_per_inch * mm_per_inch)
    return millimetres
if __name__ == '__main__':
    # The PDF to inspect may be passed as the first command-line argument;
    # without one, the original sample document is used.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else 'test_data/omotemen.pdf'

    rsrcmgr = PDFResourceManager(caching=True)
    device = ObjectFindDevice(rsrcmgr)

    # PDF is a binary format: open with 'rb' so that no newline translation
    # or text decoding is applied to the bytes handed to the parser.
    with open(pdf_path, 'rb') as fp:
        process_pdf(rsrcmgr, device, fp)

    print('fonts used: {}'.format(len(device.fonts)))
    print('no of pages: {}'.format(device.nof_pages))
    # device.pages is a set, so the pages are reported in arbitrary order.
    for page in device.pages:
        print(page.mediabox)
        print(convert_point_to_mm(page.mediabox))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment