Skip to content

Instantly share code, notes, and snippets.

@dnwe
Created January 8, 2017 10:42
Show Gist options
  • Save dnwe/8da9b5262e4615dbc512cdcef5ff5830 to your computer and use it in GitHub Desktop.
Save dnwe/8da9b5262e4615dbc512cdcef5ff5830 to your computer and use it in GitHub Desktop.
Pure Python script to count the number of pages in a directory of PDF files
#!/usr/bin/env python2
import os
import sys
import re
import fnmatch
import logging
# Root logger setup: INFO and above go to stdout, with the level name
# left-aligned in an 8-character column ahead of the message.
logging.basicConfig(
    format='%(levelname)-8s %(message)s',
    level=logging.INFO,
    stream=sys.stdout,
)
# Matches a "/Type /Page" entry but not "/Type /Pages" (the character after
# "Page" must not be "s", or the line must end there). The pattern is a bytes
# literal because the PDF is read in binary mode; the ``br`` prefix is
# accepted by both Python 2 and Python 3.
rxcountpages = re.compile(br"/Type\s*/Page([^s]|$)", re.MULTILINE | re.DOTALL)


def count_pages(pdf_name):
    """Return the number of ``/Type /Page`` markers found in *pdf_name*.

    The file is opened in binary mode, read into memory in one go and scanned
    with ``rxcountpages``. No PDF parsing is done, so the result is a raw-byte
    heuristic rather than a structural page count.

    Args:
        pdf_name: Path of the PDF file to inspect.

    Returns:
        The number of matches as an ``int`` (0 when nothing matches).

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    with open(pdf_name, mode='rb') as f:
        page_count = len(rxcountpages.findall(f.read()))
    logging.info('%s has %d', pdf_name, page_count)
    return page_count
def main():
    """Count the pages of every PDF below the directory given on the command line.

    Expects exactly one command-line argument: the directory to scan. The
    directory is walked recursively, ``count_pages`` is called for every file
    whose name ends in ``.pdf`` (compared case-insensitively), and the grand
    total is printed between two 80-character rules.

    Exits with status 1 after writing a message to stderr when the argument
    count is wrong or the argument is not an existing directory.
    """
    if len(sys.argv) != 2:
        # write(), not writelines(): the message is one string, not a list.
        sys.stderr.write('Usage: {} <directory>\n'.format(sys.argv[0]))
        sys.exit(1)
    check_dir = sys.argv[1]
    # os.walk() yields nothing for a missing path, which would otherwise be
    # reported as a misleading "Total pages: 0".
    if not os.path.isdir(check_dir):
        sys.stderr.write('Error: {} is not a directory\n'.format(check_dir))
        sys.exit(1)
    logging.info('Counting pages of any PDFs in %s', check_dir)
    total_pages = 0
    for path, _subdirs, files in os.walk(check_dir):
        for name in files:
            # Case-insensitive so that e.g. "REPORT.PDF" is also counted on
            # case-sensitive file systems.
            if not name.lower().endswith('.pdf'):
                continue
            total_pages += count_pages(os.path.join(path, name))
    print('-' * 80)
    print('Total pages in {}: {}'.format(check_dir, total_pages))
    print('-' * 80)
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment