Last active
January 13, 2020 10:08
-
-
Save Tantalus13A98B5F/1af69d830f0c74eaecc62e2edb662f36 to your computer and use it in GitHub Desktop.
Simple PDF parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''My Simple PDF Parser
Can inspect the size of a PDF file.
'''
from contextlib import suppress, redirect_stdout | |
from collections import deque, OrderedDict | |
from weakref import ref | |
import re | |
class PDFRef:
    """An indirect reference (``"N G R"``) found inside a parsed PDF value.

    The reference keeps only a weak reference to the owning document, and
    looks its target up in that document's ``objs`` mapping on demand.
    """

    def __init__(self, refstr, top):
        """Remember the reference text and the document it belongs to.

        refstr -- the reference as written in the file, e.g. ``"12 0 R"``.
        top    -- the owning document; it must expose an ``objs`` mapping
                  keyed by object header strings such as ``"12 0 obj"``.
        """
        self.refstr = refstr
        # "12 0 R" -> "12 0 obj": the key under which the target is stored.
        self.objstr = refstr.replace('R', 'obj')
        # Weak reference, so a PDFRef does not keep the document alive.
        self.top = ref(top)

    def follow(self):
        """Return the object this reference points at."""
        document = self.top()
        return document.objs[self.objstr]

    def __repr__(self):
        return '<PDFRef("%s")>' % (self.refstr,)
class PDFData:
    """Minimal, line-oriented reader for the object structure of a PDF file.

    After ``parse_pdf`` an instance carries:

    * ``objs``      -- ``{"N G obj": parsed object}`` for every indirect object
    * ``trailer``   -- the parsed trailer dictionary (if a trailer was seen)
    * ``startxref`` -- the integer following ``startxref`` (if seen)
    * ``pages``     -- list of ``PDFRef`` to every ``/Page`` leaf, in tree order

    Known limitations of this deliberately simple parser:

    * one dictionary key per line; only the first token of a value is kept
    * a value must fit on a single line (no multi-line arrays)
    * FLOAT, BOOL, NAME and STR tokens are returned as their source text
    * stream contents are skipped; only their start offset is recorded
    """

    # IMPORTANT: `re` tries the alternatives left to right and takes the first
    # one that matches (not the longest), so the order below matters: REF and
    # FLOAT must both come before INT.
    _patterns = OrderedDict([
        ('REF', r'\d+ \d+ R'),
        ('PUNC', r'\[|\]'),
        ('FLOAT', r'[-+]?(?:\d+\.\d*|\.\d+)'),
        # Signed integers are legal PDF numbers (e.g. "[0 -1 0]").
        ('INT', r'[-+]?\d+'),
        # "*" rather than "+" so that the empty string "()" is accepted.
        ('STR', r'\((?:\\\)|[^)])*\)'),
        # A name runs until whitespace or a PDF delimiter character, which
        # admits names such as "/ABCDEF+Arial" or "/Font#20Name".
        ('NAME', r'/[^\s()<>\[\]{}/%]+'),
        ('BOOL', r'true|false'),
        ('BLANK', r'\s+'),
        ('ELSE', r'.'),
    ])
    _value_pattern = re.compile('|'.join(
        r'(?P<%s>%s)' % (k, v) for k, v in _patterns.items()))
    # Same name syntax as NAME above, on bytes, used to split "key value".
    _key_pattern = re.compile(rb'/[^\s()<>\[\]{}/%]+')

    @staticmethod
    def _decode(raw):
        """Decode *raw* bytes as UTF-8, falling back to Latin-1.

        The fallback keeps values containing arbitrary bytes from aborting
        the parse; pure-ASCII and valid UTF-8 input decode as before.
        """
        try:
            return raw.decode()
        except UnicodeDecodeError:
            return raw.decode('latin-1')

    def _parse_val(self, val):
        """Parse the bytes of one dictionary value and return its first item.

        Arrays become (possibly nested) lists, INT tokens become ``int``,
        references become ``PDFRef``; every other token is kept as text.

        Raises ValueError for an unrecognised token, an unbalanced ``]`` or
        an empty value.
        """
        text = self._decode(val)
        # stack[0] is the top-level container; stack[-1] is the innermost
        # array that is currently open.
        stack = [[]]
        for mo in self._value_pattern.finditer(text):
            kind = mo.lastgroup
            token = mo.group()
            if kind == 'ELSE':
                raise ValueError(
                    'unrecognized token %r in PDF value %r' % (token, text))
            if kind == 'BLANK':
                continue
            if kind == 'PUNC':
                if token == '[':
                    nested = []
                    stack[-1].append(nested)
                    stack.append(nested)
                else:
                    if len(stack) == 1:
                        raise ValueError(
                            "unbalanced ']' in PDF value %r" % text)
                    stack.pop()
            elif kind == 'INT':
                stack[-1].append(int(token))
            elif kind == 'REF':
                stack[-1].append(PDFRef(token, self))
            else:
                stack[-1].append(token)
        if not stack[0]:
            raise ValueError('empty PDF value')
        return stack[0][0]

    @staticmethod
    def _skip_stream(f):
        """Advance *f* past the line holding the ``endstream`` keyword.

        Raises ValueError if the file ends first (instead of looping forever).
        """
        while True:
            raw = f.readline()
            if not raw:
                raise ValueError("stream is missing its 'endstream' keyword")
            if raw.strip() == b'endstream':
                return

    def _parse_dict(self, f):
        """Parse the object that starts at the current position of *f*.

        *f* is a seekable binary file positioned just after an ``N G obj`` or
        ``trailer`` line.  Returns the parsed object: a dict for ``<< >>``
        dictionaries (nested dictionaries become nested dicts), otherwise
        whatever ``_parse_val`` yields.  If a stream follows the dictionary,
        the offset of the first byte after the ``stream`` line is stored
        under the key ``'_stream'`` and the stream body is skipped.

        Raises ValueError if the file ends inside the object or its stream.
        """
        # The outermost object is stored as the value of a pseudo key 'root',
        # so dictionaries and plain values go through the same code path.
        stack, openkeys, expect = [{}], ['root'], 'value'
        ln = b''
        while openkeys:
            raw = f.readline()
            if not raw:
                raise ValueError('unexpected end of file inside a PDF object')
            ln = raw.strip()
            while openkeys and ln:
                if expect == 'value':
                    if ln.startswith(b'<<'):
                        stack.append({})
                        ln = ln[2:]
                    else:  # common value, no recursion
                        idx = ln.find(b'>>')
                        if idx < 0:
                            idx = len(ln)
                        val, ln = ln[:idx], ln[idx:]
                        stack[-1][openkeys.pop()] = self._parse_val(val)
                    expect = 'key'
                else:  # expect == 'key'
                    if ln.startswith(b'>>'):  # no more keys
                        val, ln = stack.pop(), ln[2:]
                        stack[-1][openkeys.pop()] = val
                    else:  # new key
                        mo = self._key_pattern.match(ln)
                        if mo:
                            # Works for "/Key value", "/Key[...]", "/Key<<"
                            # and for a key alone on its line (the value is
                            # then taken from the next line).
                            key, ln = mo.group(), ln[mo.end():]
                        else:
                            key, _, ln = ln.partition(b' ')
                        openkeys.append(self._decode(key))
                        expect = 'value'
                ln = ln.lstrip()
        ret = stack[0]['root']
        if isinstance(ret, dict):
            if not ln:
                # The "stream" keyword is often on the line after ">>".
                # Peek at that line and rewind if it is something else.
                pos = f.tell()
                if f.readline().strip() == b'stream':
                    ln = b'stream'
                else:
                    f.seek(pos)
            if ln == b'stream':
                ret['_stream'] = f.tell()
                self._skip_stream(f)
        return ret

    def _resolve_pages(self):
        """Return references to all ``/Page`` leaves of the page tree.

        Starts at ``trailer['/Root']`` -> ``/Pages`` and walks ``/Kids``
        breadth-first.
        """
        root = self.trailer['/Root'].follow()
        pageset_queue = deque([root['/Pages']])
        ret = []
        while pageset_queue:
            node_ref = pageset_queue.popleft()
            node = node_ref.follow()
            node_type = node['/Type']
            if node_type == '/Page':
                ret.append(node_ref)
            elif node_type == '/Pages':
                pageset_queue.extend(node['/Kids'])
        return ret

    @classmethod
    def parse_pdf(cls, f):
        """Parse the PDF in the seekable binary file *f*; return a ``PDFData``.

        Lines are compared after stripping surrounding whitespace, so files
        with ``\\r\\n`` line endings are accepted as well.

        Raises ValueError if an object is not terminated by ``endobj`` or if
        the file is truncated inside an object.
        """
        data = cls()
        data.objs = {}
        while True:
            raw = f.readline()
            if not raw:
                break
            if raw.startswith(b'%'):
                continue
            ln = raw.strip()
            if ln == b'startxref':
                data.startxref = int(f.readline().strip())
            elif ln == b'trailer':
                data.trailer = data._parse_dict(f)
            elif ln.endswith(b' obj'):
                name = cls._decode(ln)
                data.objs[name] = data._parse_dict(f)
                if f.readline().strip() != b'endobj':
                    raise ValueError("object %r: expected 'endobj'" % name)
            # Everything else (the "xref" keyword, cross-reference entries,
            # "%%EOF", ...) is ignored.
        data.pages = data._resolve_pages()
        return data
def walk_object(obj):
    """Yield ``(path, ref)`` for every ``PDFRef`` reachable inside *obj*.

    *obj* is a dict or a list; nested dicts and lists are searched
    recursively, but references themselves are not followed.  Entries under
    the key ``'/Parent'`` are skipped.

    ``path`` is a list of dict keys / list indices ordered from the
    innermost container outwards (the caller reverses it if needed).
    """
    if isinstance(obj, dict):
        entries = obj.items()
    else:
        entries = enumerate(obj)
    for key, child in entries:
        if key == '/Parent':
            continue
        if isinstance(child, PDFRef):
            yield [key], child
        elif isinstance(child, (dict, list)):
            for path, target in walk_object(child):
                # Append on the way back up: innermost key comes first.
                path.append(key)
                yield path, target
def parse_links(objs):
    """Attach a ``'_links'`` list to every dictionary object in *objs*.

    *objs* maps object names to parsed objects.  For each dictionary the
    list holds one ``(ref, path)`` tuple per reference found by
    ``walk_object``, where ``path`` is the tuple of keys / indices leading
    from the object to the reference, outermost first.

    Objects that are not dictionaries (e.g. a bare integer used as an
    indirect ``/Length``) cannot carry a ``'_links'`` entry and are left
    untouched.
    """
    for item in objs.values():
        if not isinstance(item, dict):
            continue
        # Build the list before storing it, so the walk never visits the
        # '_links' entry that is being created.
        item['_links'] = [
            (dst, tuple(reversed(keys))) for keys, dst in walk_object(item)
        ]
def print_links(objs):
    """Print the object reference graph of *objs* in Graphviz DOT format.

    *objs* maps object names to parsed objects; dictionaries are expected to
    carry the ``'_links'`` list produced by ``parse_links`` (a missing list
    is treated as "no links").  One edge is printed per link, except links
    to targets with ``/BM /Normal`` or to ``/ExtGState`` targets with
    ``/SMask /None``.  Each node is labelled with its name, its ``/Length``
    when above 1 KiB (yellow) or 1 MiB (red), and its ``/Type``/``/Subtype``.

    ``/Length`` given as an indirect reference is resolved; a length that is
    missing, dangling or not a number counts as 0.  Non-dictionary objects
    are printed as plain nodes.
    """
    print('digraph objref {')
    for src, obj in objs.items():
        if not isinstance(obj, dict):
            print('"{}" [label = "{}"]'.format(src, src))
            continue
        for dst, _ in obj.get('_links', ()):
            try:
                dstobj = dst.follow()
            except KeyError:
                # Dangling reference: still worth showing as an edge.
                dstobj = None
            if isinstance(dstobj, dict):
                if dstobj.get('/BM') == '/Normal':
                    continue
                if (dstobj.get('/Type') == '/ExtGState'
                        and dstobj.get('/SMask') == '/None'):
                    continue
            print('"{}" -> "{}"'.format(src, dst.objstr))
        attrs = {'label': src, 'style': 'filled'}
        size = obj.get('/Length', 0)
        # An indirect /Length is a reference object with a follow() method.
        follow = getattr(size, 'follow', None)
        if callable(follow):
            try:
                size = follow()
            except KeyError:
                size = 0
        if not isinstance(size, (int, float)):
            size = 0
        if size > 2**20:
            attrs['fillcolor'] = 'red'
            attrs['label'] += ' [%sM]' % round(size / 2**20, 1)
        elif size > 2**10:
            attrs['fillcolor'] = 'yellow'
            attrs['label'] += ' [%sK]' % round(size / 2**10, 1)
        else:
            del attrs['style']
        typename = obj.get('/Type', '') + obj.get('/Subtype', '')
        if typename:
            # A literal backslash-n: DOT's own line break inside a label.
            attrs['label'] += '\\n' + typename
        attrstr = ', '.join('{} = "{}"'.format(k, v) for k, v in attrs.items())
        print('"{}" [{}]'.format(src, attrstr))
    print('}')
# Parse the sample document and list the XObjects used by each page.
with open('yyj.pdf', 'rb') as f:
    data = PDFData.parse_pdf(f)

# images: one (page_number, page_ref, [(xobject_ref, subtype, length), ...])
# entry per page, pages numbered from 1.
images = []
for n, pageref in enumerate(data.pages, 1):
    page = pageref.follow()
    # /Resources and /XObject may each be stored as an indirect reference
    # instead of an inline dictionary; resolve them before calling .get().
    resources = page.get('/Resources', {})
    if isinstance(resources, PDFRef):
        resources = resources.follow()
    xo = resources.get('/XObject', {})
    if isinstance(xo, PDFRef):
        xo = xo.follow()
    tmp = []
    images.append((n, pageref, tmp))
    for idx in xo.values():
        item = idx.follow()
        tmp.append((idx, item.get('/Subtype'), item.get('/Length')))

# To dump the object reference graph in Graphviz format instead:
# with open('output.dot', 'w') as f, redirect_stdout(f):
#     # dot <file.dot> -Tpng -o <output.png>
#     parse_links(data.objs)
#     print_links(data.objs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment