Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
PDF Page Extraction/Selection in Python Using PyPDF
#! /usr/bin/env python
###############################################################################
##
## Copyright 2012 Jeet Sukumaran.
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program. If not, see <http://www.gnu.org/licenses/>.
##
###############################################################################
"""
Extract specified pages from source PDF.
"""
import sys
import os
import argparse
import pyPdf
# Script metadata. ``__version__`` is reported by the ``--version`` option and
# ``__description__`` (the module docstring) is used as the argparse description.
__prog__ = os.path.split(__file__)[1]
__version__ = "1.0.0"
__description__ = __doc__
__author__ = "Jeet Sukumaran"
__copyright__ = "Copyright (C) 2012 Jeet Sukumaran."
def main():
"""
Main CLI handler.
"""
parser = argparse.ArgumentParser(description=__description__)
parser.add_argument("--version", action="version", version="%(prog)s " + __version__)
parser.add_argument("src_pdf",
metavar="SOURCE-PDF",
type=argparse.FileType('rb'),
help="path to input pdf file")
parser.add_argument("first_page",
metavar="FIRST-PAGE",
type=int,
help="number of first page (1-based index: first page is '1')")
parser.add_argument("last_page",
metavar="LAST-PAGE",
type=str,
help="number of last page; if preceded by '+' (e.g., '+30'), specifies number of pages following first page to extract")
parser.add_argument("-o", "--output-filepath",
type=str,
default=None,
help="path to output file (if not given, will write to standard output)")
args = parser.parse_args()
first_page = args.first_page - 1
if args.last_page.startswith("+"):
last_page = args.last_page[1:].replace(" ", "")
if not last_page:
sys.exit("Need to specify number of pages")
last_page = first_page + int(last_page)
else:
last_page = int(args.last_page) - 1
pdf_in = pyPdf.PdfFileReader(args.src_pdf)
pdf_out = pyPdf.PdfFileWriter()
for pg_num in range(first_page, last_page + 1):
pdf_out.addPage(pdf_in.getPage(pg_num))
if args.output_filepath:
out_stream = open(os.path.expandvars(os.path.expanduser(args.output_filepath)), "wb")
else:
out_stream = sys.stdout
pdf_out.write(out_stream)
out_stream.close()
# Run the command-line handler only when executed as a script, not on import.
if "__main__" == __name__:
    main()
@maroneal1

This comment has been minimized.

Copy link

maroneal1 commented Jun 19, 2014

How did you get pdf_out.write(out_stream) to work? If out_stream is sys.stdout, there are errors because sys.stdout does not support tell(); i.e., I get "error 29: illegal seek" from this line of code: object_positions.append(stream.tell())
when sys.stdout is passed as the stream.

@devnowcommit

This comment has been minimized.

Copy link

devnowcommit commented May 15, 2018

Could you please tell if this is still working? Thank you very much!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.