pankajthekush/README.md

## README.md

      
    Raw
  

              README.md
            
          
    PDFINFO example

This has now been modified to work with either Python 2 or Python 3.
An example has been added; see example.py. Running it prints the metadata dictionary, for example:
python ./example.py
{'Tagged': 'no', 'Producer': 'Mac OS X 10.12.1 Quartz PDFContext', 'Creator': 'Word', 'Encrypted': 'no', 'Author': 'Shekhar Vemuri', 'File size': '6264512 bytes', 'Optimized': 'no', 'PDF version': '1.3', 'ModDate': 'Thu Dec  8 11:42:16 2016 MST', 'Title': 'Guide to Apache Airflow', 'Page size': '612 x 792 pts (letter)', 'CreationDate': 'Thu Dec  8 11:42:16 2016 MST', 'Pages': '6'}

  
## example.py
from pdfinfo import pdfinfo

# Extract the metadata of a sample PDF and show the resulting dictionary.
metadata = pdfinfo("/tmp/GuideToApacheAirflow.pdf")
print(metadata)

## pdfinfo.py

import subprocess
import os.path as osp


def pdfinfo(infile):
    """
    Extract PDF meta information by wrapping the pdfinfo command line utility.

    The utility is part of poppler-utils:
        sudo apt-get install poppler-utils

    This function parses the text output that looks like this:
        Title:          PUBLIC MEETING AGENDA
        Author:         Customer Support
        Creator:        Microsoft Word 2010
        Producer:       Microsoft Word 2010
        CreationDate:   Thu Dec 20 14:44:56 2012
        ModDate:        Thu Dec 20 14:44:56 2012
        Tagged:         yes
        Pages:          2
        Encrypted:      no
        Page size:      612 x 792 pts (letter)
        File size:      104739 bytes
        Optimized:      no
        PDF version:    1.5

    Each line is split at its first colon. The left side is compared
    exactly against the known labels, and the right side (stripped) becomes
    the value, so colons inside values such as timestamps are preserved.

    Args:
        infile: Path of the PDF file to inspect.

    Returns:
        A dictionary mapping each recognised label found in the output to
        its string value. Labels absent from the output are omitted.

    Raises:
        RuntimeError: If the pdfinfo executable cannot be located, if
            ``infile`` does not exist, or if pdfinfo exits with a non-zero
            status.
    """
    # Local import keeps this block self-contained; only the PATH fallback
    # below needs it.
    import shutil

    cmd = '/usr/bin/pdfinfo'
    if not osp.exists(cmd):
        # Fall back to a PATH lookup so installs outside /usr/bin
        # (e.g. /usr/local/bin) still work.
        located = shutil.which('pdfinfo')
        if located is None:
            raise RuntimeError('System command not found: %s' % cmd)
        cmd = located

    if not osp.exists(infile):
        raise RuntimeError('Provided input file not found: %s' % infile)

    labels = frozenset([
        'Title', 'Author', 'Creator', 'Producer', 'CreationDate',
        'ModDate', 'Tagged', 'Pages', 'Encrypted', 'Page size',
        'File size', 'Optimized', 'PDF version',
    ])

    ps = subprocess.run([cmd, infile], capture_output=True)
    if ps.returncode != 0:
        message = 'error while running pdfinfo'
        # Surface pdfinfo's own diagnostics when it printed any.
        detail = ps.stderr.decode('utf-8', errors='replace').strip()
        if detail:
            message = '%s: %s' % (message, detail)
        raise RuntimeError(message)

    output = {}
    # errors='replace' keeps one malformed metadata byte from aborting the
    # whole extraction.
    ps_out = ps.stdout.decode('utf-8', errors='replace')
    for line in ps_out.splitlines():
        key, colon, value = line.partition(':')
        key = key.strip()
        # Exact key match: a substring test would let a value such as
        # "Producer: PDF Creator" overwrite the 'Creator' entry.
        if colon and key in labels:
            output[key] = value.strip()

    return output
	from pdfinfo import pdfinfo

	# Extract the metadata of a sample PDF and show the resulting dictionary.
	metadata = pdfinfo("/tmp/GuideToApacheAirflow.pdf")
	print(metadata)

	import subprocess
	import os.path as osp


	def pdfinfo(infile):
	    """
	    Extract PDF meta information by wrapping the pdfinfo command line utility.

	    The utility is part of poppler-utils:
	        sudo apt-get install poppler-utils

	    This function parses the text output that looks like this:
	        Title: PUBLIC MEETING AGENDA
	        Author: Customer Support
	        Creator: Microsoft Word 2010
	        Producer: Microsoft Word 2010
	        CreationDate: Thu Dec 20 14:44:56 2012
	        ModDate: Thu Dec 20 14:44:56 2012
	        Tagged: yes
	        Pages: 2
	        Encrypted: no
	        Page size: 612 x 792 pts (letter)
	        File size: 104739 bytes
	        Optimized: no
	        PDF version: 1.5

	    Each line is split at its first colon. The left side is compared
	    exactly against the known labels, and the right side (stripped) becomes
	    the value, so colons inside values such as timestamps are preserved.

	    Args:
	        infile: Path of the PDF file to inspect.

	    Returns:
	        A dictionary mapping each recognised label found in the output to
	        its string value. Labels absent from the output are omitted.

	    Raises:
	        RuntimeError: If the pdfinfo executable cannot be located, if
	            ``infile`` does not exist, or if pdfinfo exits with a non-zero
	            status.
	    """
	    # Local import keeps this block self-contained; only the PATH fallback
	    # below needs it.
	    import shutil

	    cmd = '/usr/bin/pdfinfo'
	    if not osp.exists(cmd):
	        # Fall back to a PATH lookup so installs outside /usr/bin
	        # (e.g. /usr/local/bin) still work.
	        located = shutil.which('pdfinfo')
	        if located is None:
	            raise RuntimeError('System command not found: %s' % cmd)
	        cmd = located

	    if not osp.exists(infile):
	        raise RuntimeError('Provided input file not found: %s' % infile)

	    labels = frozenset([
	        'Title', 'Author', 'Creator', 'Producer', 'CreationDate',
	        'ModDate', 'Tagged', 'Pages', 'Encrypted', 'Page size',
	        'File size', 'Optimized', 'PDF version',
	    ])

	    ps = subprocess.run([cmd, infile], capture_output=True)
	    if ps.returncode != 0:
	        message = 'error while running pdfinfo'
	        # Surface pdfinfo's own diagnostics when it printed any.
	        detail = ps.stderr.decode('utf-8', errors='replace').strip()
	        if detail:
	            message = '%s: %s' % (message, detail)
	        raise RuntimeError(message)

	    output = {}
	    # errors='replace' keeps one malformed metadata byte from aborting the
	    # whole extraction.
	    ps_out = ps.stdout.decode('utf-8', errors='replace')
	    for line in ps_out.splitlines():
	        key, colon, value = line.partition(':')
	        key = key.strip()
	        # Exact key match: a substring test would let a value such as
	        # "Producer: PDF Creator" overwrite the 'Creator' entry.
	        if colon and key in labels:
	            output[key] = value.strip()

	    return output