Skip to content

Instantly share code, notes, and snippets.

  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save jamiesanson/bfa3236788f59089fdb2e9e9a83f609d to your computer and use it in GitHub Desktop.
PDF Counting in Python
#!/usr/bin/env python3
import os
import sys
import textract
def main():
    """Print the number of words in the document named on the command line.

    Command-line arguments (read from ``sys.argv``):
        1. Path of the file handed to ``textract.process``.
        2. Optional integer that is subtracted from the raw word count.

    Exits with status 1, after printing a message to stderr, when the file
    argument is missing, the exclusion count is not an integer, or the path
    is not an existing file. Any exception raised by ``textract.process`` is
    re-raised after an installation hint has been printed.
    """
    if len(sys.argv) < 2:
        print('command usage: python count.py FileName <optional count to exclude>',
              file=sys.stderr)
        sys.exit(1)

    pdf_file = sys.argv[1]

    # argv entries are strings; convert before doing arithmetic so a bad
    # value is reported as such instead of surfacing later as a TypeError.
    count_to_exclude = 0
    if len(sys.argv) > 2:
        try:
            count_to_exclude = int(sys.argv[2])
        except ValueError:
            print('count to exclude must be an integer, got %r' % sys.argv[2],
                  file=sys.stderr)
            sys.exit(1)

    # os.path.isfile never raises for a missing path, so test its result
    # directly rather than waiting for an OSError.
    if not os.path.isfile(pdf_file):
        print('No such file: %s' % pdf_file, file=sys.stderr)
        sys.exit(1)

    # Keep the try body to the extraction call only, so the hint below is not
    # printed for unrelated failures.
    try:
        extracted = textract.process(pdf_file)
    except Exception:
        print('', file=sys.stderr)
        print('-------------------------------------------------------------------------------',
              file=sys.stderr)
        print("Extraction module for Textract not found. Install this from below and try again",
              file=sys.stderr)
        print('-------------------------------------------------------------------------------',
              file=sys.stderr)
        print('', file=sys.stderr)
        raise

    # split() with no separator splits on any run of whitespace, so newlines
    # separate words and repeated spaces do not create empty tokens.
    count = len(extracted.split()) - count_to_exclude
    print("Word count in %s: %d" % (pdf_file, count))
# Run the word counter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
pip install textract
# If this doesn't work, or fails at some point, do this instead
pip install --no-deps textract
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment