Created
January 8, 2017 10:42
-
-
Save dnwe/8da9b5262e4615dbc512cdcef5ff5830 to your computer and use it in GitHub Desktop.
Pure Python script to count the number of pages in a directory of PDF files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
import os | |
import sys | |
import re | |
import fnmatch | |
import logging | |
# Route log records to stdout at INFO level, with the level name left-aligned
# in an 8-character column ahead of the message text.
logging.basicConfig(
    format="%(levelname)-8s %(message)s",
    level=logging.INFO,
    stream=sys.stdout,
)
# Matches "/Type /Page" markers while rejecting "/Type /Pages" (the trailing
# group requires a non-"s" character or end of line after "Page").
# The pattern is a bytes literal because the PDF is read in binary mode: on
# Python 3 a str pattern cannot be applied to bytes (TypeError), and on
# Python 2 the ``b`` prefix is a no-op, so this form works on both.
rxcountpages = re.compile(br"/Type\s*/Page([^s]|$)", re.MULTILINE | re.DOTALL)


def count_pages(pdf_name):
    '''Return the number of ``/Type /Page`` markers found in a PDF file.

    The file is read in full as raw bytes and scanned with ``rxcountpages``.
    This is a textual heuristic, not a PDF parse: only markers that appear
    literally in the file's bytes are counted.

    Args:
        pdf_name: Path of the PDF file to scan.

    Returns:
        The number of matches as an int (0 when there are none).

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    '''
    with open(pdf_name, mode='rb') as pdf_file:
        page_count = len(rxcountpages.findall(pdf_file.read()))
    logging.info('%s has %d', pdf_name, page_count)
    return page_count
def main():
    '''Count the pages of every PDF below the directory named on the command line.

    Expects exactly one argument (``sys.argv[1]``), the directory to walk
    recursively. Each file whose name ends in ``.pdf`` (in any letter case)
    is passed to ``count_pages`` and the results are summed; the total is
    printed to stdout between two 80-character rules.

    Exits with status 1, after writing a message to stderr, when the
    argument count is wrong or the argument is not an existing directory.
    '''
    if len(sys.argv) != 2:
        sys.stderr.write('Usage: {} <directory>\n'.format(sys.argv[0]))
        sys.exit(1)

    check_dir = sys.argv[1]
    # os.walk ignores listing errors by default, so a mistyped path would
    # otherwise be reported as a directory containing zero pages.
    if not os.path.isdir(check_dir):
        sys.stderr.write('{} is not a directory\n'.format(check_dir))
        sys.exit(1)

    logging.info('Counting pages of any PDFs in %s', check_dir)
    total_pages = 0
    for path, _subdirs, files in os.walk(check_dir):
        for name in files:
            # fnmatch.filter is case-sensitive on POSIX; lower-casing the name
            # makes "REPORT.PDF" match on every platform.
            if not fnmatch.fnmatchcase(name.lower(), '*.pdf'):
                continue
            total_pages += count_pages(os.path.join(path, name))

    print('-' * 80)
    print('Total pages in {}: {}'.format(check_dir, total_pages))
    print('-' * 80)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment