-
-
Save pombredanne/2b9dd79cc5767963c71c23ce6777fe34 to your computer and use it in GitHub Desktop.
A simple Python wrapper for the pdfinfo command-line tool.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def pdfinfo(infile, cmd='/usr/bin/pdfinfo'):
    """
    Wrap the command line utility pdfinfo to extract PDF meta information.

    The tool is provided by poppler-utils on Debian/Ubuntu:

        sudo apt-get install poppler-utils

    This function parses text output that looks like this:

        Title:          PUBLIC MEETING AGENDA
        Author:         Customer Support
        Creator:        Microsoft Word 2010
        Producer:       Microsoft Word 2010
        CreationDate:   Thu Dec 20 14:44:56 2012
        ModDate:        Thu Dec 20 14:44:56 2012
        Tagged:         yes
        Pages:          2
        Encrypted:      no
        Page size:      612 x 792 pts (letter)
        File size:      104739 bytes
        Optimized:      no
        PDF version:    1.5

    Args:
        infile: Path of the PDF file to inspect.
        cmd: Path of the pdfinfo executable. Defaults to the location used
            by the Debian/Ubuntu package.

    Returns:
        A dictionary mapping each recognised label (``'Title'``, ``'Pages'``,
        ``'Page size'``, ...) to its value as a stripped text string. Labels
        that do not appear in the tool's output are absent from the result,
        and output lines with other labels are ignored.

    Raises:
        RuntimeError: If ``cmd`` or ``infile`` does not exist.
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    # Imported locally so the function does not depend on module-level
    # imports being present.
    import os.path as osp
    import subprocess

    if not osp.exists(cmd):
        raise RuntimeError('System command not found: %s' % cmd)
    if not osp.exists(infile):
        raise RuntimeError('Provided input file not found: %s' % infile)

    labels = frozenset([
        'Title', 'Author', 'Creator', 'Producer', 'CreationDate',
        'ModDate', 'Tagged', 'Pages', 'Encrypted', 'Page size',
        'File size', 'Optimized', 'PDF version',
    ])

    raw_output = subprocess.check_output([cmd, infile])
    # check_output returns bytes on Python 3; decode once so that all the
    # parsing below works on text. Undecodable bytes are replaced rather
    # than aborting the whole call.
    text = raw_output.decode('utf-8', 'replace')

    output = {}
    for line in text.splitlines():
        # Only the text before the FIRST colon is the label. Matching the
        # label exactly (instead of searching the whole line) keeps a value
        # such as "Title: Author: ..." from being recorded under 'Author'.
        label, sep, value = line.partition(':')
        label = label.strip()
        if sep and label in labels:
            output[label] = value.strip()
    return output
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment