Skip to content

Instantly share code, notes, and snippets.

@J08nY
Last active March 8, 2017 21:25
Show Gist options
  • Save J08nY/e965805543a9b2a72672 to your computer and use it in GitHub Desktop.
Save J08nY/e965805543a9b2a72672 to your computer and use it in GitHub Desktop.
PDF page counter
#!/usr/bin/env python
from PyPDF2 import PdfFileReader
from PyPDF2.utils import PdfReadError
import re
import argparse
from os import walk, path
import magic
import hashlib
class Pdf(object):
    """A PDF file on disk with a content hash and a lazily cached page count.

    Attributes:
        path: Filesystem path the instance was created with.
        length: Cached page count used by ``len()``; ``None`` until first use.
        hash: Hex MD5 digest of the file contents, computed at construction.
    """

    # Matches "/Type /Page" markers while rejecting "/Type /Pages" (the
    # trailing group demands a non-"s" character or an end of line).
    # It is a *bytes* pattern because count() reads the file in binary mode;
    # a text pattern cannot be applied to bytes on Python 3. On Python 2 the
    # ``br`` prefix yields an ordinary ``str``, so behaviour there is unchanged.
    pdf_pages = re.compile(br"/Type\s*/Page([^s]|$)", re.MULTILINE | re.DOTALL)

    def __init__(self, path):
        """Remember *path* and hash the file immediately.

        Raises whatever ``open`` raises if the file cannot be read, because
        the hash is computed eagerly.
        """
        self.path = path
        self.length = None
        self.hash = self.rehash()

    def count(self):
        """Return the number of pages in the file.

        The raw bytes are scanned for page markers first. Only when that scan
        finds nothing is the file handed to ``PdfFileReader``; if that parser
        rejects the file with ``PdfReadError`` the result is 0.
        """
        with open(self.path, "rb") as f:
            data = f.read()
            pages = len(Pdf.pdf_pages.findall(data))
            if pages == 0:
                # Fallback for files where the marker scan finds no pages
                # (presumably because the page objects are not stored as
                # plain text -- TODO confirm). Rewind so the parser sees the
                # file from the start.
                f.seek(0)
                try:
                    pages = PdfFileReader(f).getNumPages()
                except PdfReadError:
                    pages = 0
        return pages

    def __len__(self):
        """Return the page count, computing it once and caching it."""
        if self.length is None:
            self.length = self.count()
        return self.length

    def rehash(self):
        """Return the hex MD5 digest of the file contents.

        The file is read in 64 KiB chunks so large files are never held in
        memory just for hashing. The digest serves as a content fingerprint
        (e.g. for spotting duplicate files), not as a security measure.
        """
        blocksize = 65536
        md5 = hashlib.md5()
        with open(self.path, "rb") as f:
            # iter() with a sentinel stops at the empty read that marks EOF.
            for chunk in iter(lambda: f.read(blocksize), b""):
                md5.update(chunk)
        return md5.hexdigest()
def main():
    """Count pages of the PDF file(s) named on the command line and report them.

    The positional argument may be a single file or a directory (default
    "."). For a directory, every entry whose libmagic MIME type contains
    "application/pdf" is counted; only the top level is scanned unless
    -r/--recursive is given, and names passed via -e/--exclude are skipped.
    Files whose content hash was already seen are counted only once.

    One "<name> <pages>" line is printed per counted file, followed by a
    "## Total: <pages>" line. Exits with status 1 if the path does not exist.
    """
    parser = argparse.ArgumentParser(description="Counts the number of pages in pdf files in a directory.")
    parser.add_argument("-e", "--exclude", dest="excludes", action="append", help="Directories/files to exclude")
    parser.add_argument("directory", nargs="?", default=".", help="Directory to count")
    parser.add_argument("-r", "--recursive", dest="recursive", action="store_true", help="Recurse into subdirs.")
    args = parser.parse_args()

    # NOTE: print is always called with one pre-formatted string so that each
    # call parses and prints identically under Python 2 and Python 3.
    if not path.exists(args.directory):
        print("Directory/File doesnt exist.")
        # SystemExit directly: the bare exit() helper is injected by the
        # site module and is not guaranteed to exist.
        raise SystemExit(1)

    pages = 0
    # A set gives constant-time duplicate checks regardless of file count.
    seen = set()
    m = magic.open(magic.MAGIC_MIME)
    m.load()

    if path.isfile(args.directory):
        pages = Pdf(args.directory).count()
        print("%s %s" % (args.directory, pages))
    else:
        # With no -e options argparse leaves excludes as None; an empty set
        # then filters nothing.
        excluded = set(args.excludes or ())
        for root, dirs, files in walk(args.directory):
            # Prune in place so walk() does not descend into excluded dirs.
            dirs[:] = [d for d in dirs if d not in excluded]
            files[:] = [f for f in files if f not in excluded]
            for f in files:
                file_path = path.join(root, f)
                # NOTE(review): the binding may return None for entries it
                # cannot inspect -- confirm against the installed version.
                # Treat that as "not a PDF" instead of raising TypeError.
                file_type = m.file(file_path) or ""
                if "application/pdf" not in file_type:
                    continue
                pdf = Pdf(file_path)
                if pdf.hash in seen:
                    continue
                seen.add(pdf.hash)
                p = pdf.count()
                print("%s %s" % (f, p))
                pages += p
            if not args.recursive:
                # The first walk() iteration is the top-level directory.
                break

    print("## Total: %s" % pages)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment