Skip to content

Instantly share code, notes, and snippets.

@Tantalus13A98B5F
Last active January 13, 2020 10:08
Show Gist options
  • Save Tantalus13A98B5F/1af69d830f0c74eaecc62e2edb662f36 to your computer and use it in GitHub Desktop.
Save Tantalus13A98B5F/1af69d830f0c74eaecc62e2edb662f36 to your computer and use it in GitHub Desktop.
Simple PDF parser
'''My Simple PDF Parser
Can inspect the size of a PDF file.
'''
from contextlib import suppress, redirect_stdout
from collections import deque, OrderedDict
from weakref import ref
import re
class PDFRef:
    """Indirect reference (``N G R``) to an object held by a parsed document.

    The owning document is kept through a weak reference, so a reference
    stored inside the document's own object table does not keep the
    document alive.

    Attributes:
        refstr: the reference exactly as written in the file, e.g. ``'12 0 R'``.
        objstr: ``refstr`` with ``R`` replaced by ``obj`` (``'12 0 obj'``);
            this is the key looked up in the owner's ``objs`` mapping.
        top: weak reference to the owning document.
    """

    def __init__(self, refstr, top):
        self.refstr = refstr
        self.objstr = refstr.replace('R', 'obj')
        self.top = ref(top)

    def follow(self):
        """Return the object this reference points to.

        Raises:
            ReferenceError: the owning document has been garbage collected.
            KeyError: the owner's ``objs`` has no entry for ``objstr``.
        """
        owner = self.top()
        if owner is None:
            # A dead weak reference yields None; fail with a clear message
            # instead of an AttributeError on ``None.objs``.
            raise ReferenceError(
                'the document owning %r no longer exists' % self.refstr)
        return owner.objs[self.objstr]

    def __repr__(self):
        return '<PDFRef("%s")>' % self.refstr
class PDFData:
    """Line-oriented reader for the object structure of a PDF file.

    An instance returned by :meth:`parse_pdf` carries:

    * ``objs``      -- ``{'N G obj': parsed value}`` for every indirect
      object that was read,
    * ``trailer``   -- the last trailer dictionary that was read,
    * ``startxref`` -- the integer following the last ``startxref`` keyword
      (``None`` when the keyword never appeared),
    * ``pages``     -- a :class:`PDFRef` for every page, collected
      breadth-first from the page tree.

    Parsed values map to Python as follows: dictionaries become ``dict``
    (keys keep their leading ``/``), arrays become ``list``, integers become
    ``int``, references become :class:`PDFRef`; names, reals, booleans,
    literal strings and hex strings are kept as their raw source text.
    A dictionary followed by a stream gets an extra ``'_stream'`` entry
    holding the file offset of the first byte after the ``stream`` line.

    Known limits: object headers must stand on their own line, array values
    must not span lines, and dictionaries nested inside arrays are not
    supported.
    """

    # IMPORTANT: `re` tries the alternatives in order, so the first pattern
    # that matches wins even when a later one would match a longer text.
    _patterns = OrderedDict([
        ('REF', r'\d+ \d+ R'),
        ('PUNC', r'\[|\]'),
        ('FLOAT', r'[-+]?(?:\d+\.\d*|\.\d+)'),
        ('INT', r'[-+]?\d+'),
        ('STR', r'\((?:\\\)|[^)])*\)'),
        ('HEX', r'<[0-9A-Fa-f\s]*>'),
        ('NAME', r'/[^\s()<>\[\]{}/%]+'),
        ('BOOL', r'true|false'),
        ('BLANK', r'\s+'),
        ('ELSE', r'.'),
    ])
    _value_pattern = re.compile('|'.join(
        r'(?P<%s>%s)' % (k, v) for k, v in _patterns.items()), re.ASCII)
    # A dictionary key is a name: '/' followed by non-delimiter characters.
    _key_pattern = re.compile(rb'/[^\s()<>\[\]{}/%]+')

    @staticmethod
    def _decode(raw):
        """Decode *raw* as UTF-8, falling back to Latin-1 for arbitrary bytes."""
        try:
            return raw.decode()
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a character, so this cannot fail.
            return raw.decode('latin-1')

    def _split_value(self, ln):
        """Split *ln* into ``(first complete value, remaining bytes)``.

        A value is a single token or a bracketed array.  An array that is
        not closed on this line swallows the whole line.
        """
        # Latin-1 keeps character offsets equal to byte offsets.
        text = ln.decode('latin-1')
        depth, end = 0, len(text)
        for mo in self._value_pattern.finditer(text):
            if mo.lastgroup == 'BLANK':
                continue
            token = mo.group()
            if token == '[':
                depth += 1
            elif token == ']':
                depth -= 1
            if depth <= 0:
                end = mo.end()
                break
        return ln[:end], ln[end:]

    def _parse_val(self, val):
        """Parse the bytes of one non-dictionary value.

        Args:
            val: bytes holding a single token or one bracketed array.

        Returns:
            The first value found in *val* (see the class docstring for the
            mapping to Python types).

        Raises:
            ValueError: *val* is empty, contains a character that starts no
                known token, or has a ``]`` without a matching ``[``.
        """
        text = self._decode(val)
        stack = [[]]
        for mo in self._value_pattern.finditer(text):
            kind = mo.lastgroup
            token = mo.group()
            if kind == 'BLANK':
                continue
            if kind == 'ELSE':
                raise ValueError(
                    'unexpected character %r in PDF value %r' % (token, text))
            if token == '[':
                nested = []
                stack[-1].append(nested)
                stack.append(nested)
            elif token == ']':
                if len(stack) == 1:
                    raise ValueError('unbalanced "]" in PDF value %r' % text)
                stack.pop()
            elif kind == 'INT':
                stack[-1].append(int(token))
            elif kind == 'REF':
                stack[-1].append(PDFRef(token, self))
            else:
                stack[-1].append(token)
        if not stack[0]:
            raise ValueError('empty PDF value')
        return stack[0][0]

    def _skip_stream(self, f):
        """Advance *f* past the line that ends with ``endstream``."""
        while True:
            line = f.readline()
            if not line:
                raise ValueError('unexpected end of file inside a stream')
            # The data need not end with a newline, so accept the keyword
            # at the end of a line as well as on a line of its own.
            if line.rstrip(b'\r\n').endswith(b'endstream'):
                return

    def _parse_dict(self, f):
        """Read one object body (usually a dictionary) starting at *f*.

        Reads lines from the binary, seekable file *f* until one complete
        value has been parsed.  If the ``stream`` keyword follows, on the
        same line or on the next one, the stream data is skipped and, when
        the value is a dictionary, its offset is stored under ``'_stream'``.

        Returns:
            The parsed value.

        Raises:
            ValueError: the file ends before the value or its stream is
                complete, or a dictionary key is malformed.
        """
        # `stack` holds the dictionaries currently open, `openkeys` the keys
        # still waiting for a value; the artificial 'root' key receives the
        # top-level value.
        stack, openkeys, expect = [{}], ['root'], 'value'
        ln = b''
        while openkeys:
            raw = f.readline()
            if not raw:
                raise ValueError('unexpected end of file inside a PDF object')
            ln = raw.strip()
            while openkeys and ln:
                if expect == 'value':
                    if ln.startswith(b'<<'):
                        stack.append({})
                        ln = ln[2:]
                    else:  # plain value, no recursion
                        val, ln = self._split_value(ln)
                        stack[-1][openkeys.pop()] = self._parse_val(val)
                    expect = 'key'
                else:  # expect == 'key'
                    if ln.startswith(b'>>'):  # no more keys
                        val, ln = stack.pop(), ln[2:]
                        stack[-1][openkeys.pop()] = val
                    else:  # new key
                        mo = self._key_pattern.match(ln)
                        if mo is None:
                            raise ValueError(
                                'expected a dictionary key, found %r' % ln)
                        openkeys.append(self._decode(mo.group()))
                        ln = ln[mo.end():]
                        expect = 'value'
                ln = ln.lstrip()
        ret = stack[0]['root']
        if not ln:
            # The `stream` keyword commonly sits on the line after `>>`;
            # peek at it and rewind when it is something else.
            pos = f.tell()
            if f.readline().strip() == b'stream':
                ln = b'stream'
            else:
                f.seek(pos)
        if ln == b'stream':
            if isinstance(ret, dict):
                ret['_stream'] = f.tell()
            self._skip_stream(f)
        return ret

    def _resolve_pages(self):
        """Return the references of all pages, walking the tree breadth-first.

        Starts at ``/Root`` -> ``/Pages`` of the trailer; nodes typed
        ``/Pages`` are expanded through ``/Kids``, nodes typed ``/Page``
        are collected.
        """
        root = self.trailer['/Root'].follow()
        pending = deque([root['/Pages']])
        found = []
        while pending:
            node_ref = pending.popleft()
            node = node_ref.follow()
            if node['/Type'] == '/Page':
                found.append(node_ref)
            elif node['/Type'] == '/Pages':
                pending.extend(node['/Kids'])
        return found

    @classmethod
    def parse_pdf(cls, f):
        """Parse the PDF in the binary, seekable file *f*.

        Returns:
            A new instance with ``objs``, ``trailer``, ``startxref`` and
            ``pages`` filled in.

        Raises:
            ValueError: an object is not closed by ``endobj``, the file is
                truncated, or no trailer dictionary was found.
        """
        data = cls()
        data.objs = {}
        data.trailer = None
        data.startxref = None
        while True:
            raw = f.readline()
            if not raw:
                break
            # Compare stripped lines so that CRLF files work too.
            ln = raw.strip()
            if ln.startswith(b'%'):
                continue
            if ln == b'startxref':
                data.startxref = int(f.readline().strip())
            elif ln == b'trailer':
                data.trailer = data._parse_dict(f)
            elif re.fullmatch(rb'\d+ \d+ obj', ln):
                data.objs[ln.decode()] = data._parse_dict(f)
                if f.readline().strip() != b'endobj':
                    raise ValueError(
                        'object %s is not terminated by endobj' % ln.decode())
            # Everything else (the xref table, stray lines) is ignored.
        if data.trailer is None:
            raise ValueError('no trailer dictionary found')
        data.pages = data._resolve_pages()
        return data
def walk_object(obj):
    """Yield ``(keys, ref)`` for every PDFRef reachable inside *obj*.

    Dictionaries are walked by key and sequences by index; entries stored
    under the key ``'/Parent'`` are skipped.  Any other kind of object
    (number, name, ...) contains no references and yields nothing.

    *keys* is a fresh list holding the path to the reference with the
    innermost key first; callers that want root-to-leaf order must
    reverse it.
    """
    if isinstance(obj, dict):
        entries = obj.items()
    elif isinstance(obj, (list, tuple)):
        entries = enumerate(obj)
    else:
        # Scalars have nothing to iterate over.
        return
    for key, child in entries:
        if key == '/Parent':
            continue
        if isinstance(child, PDFRef):
            yield [key], child
        elif isinstance(child, (dict, list)):
            for keys, target in walk_object(child):
                # Paths are built on the way out of the recursion, hence
                # innermost-first.
                keys.append(key)
                yield keys, target
def parse_links(objs):
    """Record the outgoing references of every dictionary object in *objs*.

    Each dictionary in ``objs.values()`` receives a ``'_links'`` entry: a
    list of ``(ref, path)`` tuples where *ref* is a PDFRef found inside the
    object and *path* is the tuple of keys/indices leading to it, outermost
    first.  Objects that are not dictionaries (arrays, numbers, ...) cannot
    carry the extra entry and are left untouched.
    """
    for item in objs.values():
        if not isinstance(item, dict):
            continue
        # walk_object reports paths innermost-first; store them reversed.
        item['_links'] = [
            (dst, tuple(reversed(keys))) for keys, dst in walk_object(item)
        ]
def print_links(objs):
    """Print the reference graph of *objs* to stdout in Graphviz DOT syntax.

    One edge is printed per entry of each object's ``'_links'`` list, except
    edges to dictionaries whose ``/BM`` is ``/Normal`` or that are
    ``/ExtGState`` with ``/SMask`` ``/None``.  One node line is printed per
    object: its label is the object key, followed by the ``/Length`` in K or
    M when above 1 KiB (filled yellow) or 1 MiB (filled red), and by
    ``/Type`` + ``/Subtype`` when present.

    Objects without ``'_links'`` simply contribute no edges.
    """
    print('digraph objref {')
    for src, obj in objs.items():
        # Arrays and scalars have no attributes; treat them as empty dicts.
        fields = obj if isinstance(obj, dict) else {}
        for dst, _path in fields.get('_links', ()):
            target = dst.follow()
            if isinstance(target, dict):
                if target.get('/BM') == '/Normal':
                    continue
                if (target.get('/Type') == '/ExtGState'
                        and target.get('/SMask') == '/None'):
                    continue
            print('"{}" -> "{}"'.format(src, dst.objstr))

        size = fields.get('/Length', 0)
        # /Length may itself be an indirect reference; resolve it before
        # comparing it with numbers.
        follow = getattr(size, 'follow', None)
        if callable(follow):
            size = follow()
        if not isinstance(size, (int, float)):
            size = 0

        attrs = {'label': src, 'style': 'filled'}
        if size > 2**20:
            attrs['fillcolor'] = 'red'
            attrs['label'] += ' [%sM]' % round(size / 2**20, 1)
        elif size > 2**10:
            attrs['fillcolor'] = 'yellow'
            attrs['label'] += ' [%sK]' % round(size / 2**10, 1)
        else:
            del attrs['style']
        typename = fields.get('/Type', '') + fields.get('/Subtype', '')
        if typename:
            # A literal backslash-n is the line break inside a DOT label.
            attrs['label'] += '\\n' + typename
        attrstr = ', '.join(
            '{} = "{}"'.format(k, v) for k, v in attrs.items())
        print('"{}" [{}]'.format(src, attrstr))
    print('}')
def _deref(value):
    """Return the object behind *value* if it is a PDFRef, else *value*."""
    return value.follow() if isinstance(value, PDFRef) else value


with open('yyj.pdf', 'rb') as f:
    data = PDFData.parse_pdf(f)

# images: one (page number, page reference, entries) tuple per page, where
# every entry is (XObject reference, its /Subtype, its /Length).
images = []
for n, pageref in enumerate(data.pages, 1):
    page = pageref.follow()
    # /Resources and /XObject may be stored inline or as indirect
    # references; resolve them before treating them as dictionaries.
    resources = _deref(page.get('/Resources', {}))
    xo = _deref(resources.get('/XObject', {}))
    tmp = []
    images.append((n, pageref, tmp))
    for idx in xo.values():
        item = idx.follow()
        tmp.append((idx, item.get('/Subtype'), item.get('/Length')))
# with open('output.dot', 'w') as f, redirect_stdout(f):
# # dot <file.dot> -Tpng -o <output.png>
# parse_links(data.objs)
# print_links(data.objs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment