Skip to content

Instantly share code, notes, and snippets.

@pjatx
Last active September 25, 2018 01:41
Show Gist options
  • Save pjatx/eeaf842259b618403a9cd649b11014c1 to your computer and use it in GitHub Desktop.
Save pjatx/eeaf842259b618403a9cd649b11014c1 to your computer and use it in GitHub Desktop.
Rename each resume PDF after the first email address found in it
# Import global broad stuff
import os
import os.path
import shutil
import re
from optparse import OptionParser
# Import PDF Miner specific stuff to use as library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
# Global Variables
# Input and output directories, both resolved relative to the current
# working directory at import time.
src_dir = os.path.join(os.curdir, 'to-process')
dst_dir = os.path.join(os.curdir, 'processed')
# Regex to find emails
# Matches lower-case addresses written either literally ("a@b.com") or in the
# obfuscated "a at b dot com" form. Group 1 is the whole match; groups 2 and 3
# capture the "@"/" at " and "."/" dot " separators.
# Raw strings are used so that the regex escapes (\/ \. \s) are not parsed as
# (invalid) string-literal escapes, which newer Python versions warn about.
regex = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
    r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
    r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"
)
# Helper Functions
def copy_rename(old_file_name, new_file_name):
    """Copy a file from ``src_dir`` into ``dst_dir`` under a new name.

    Args:
        old_file_name: Name of the file inside ``src_dir``.
        new_file_name: Name the copy should have inside ``dst_dir``.

    Raises:
        OSError: If the source cannot be read or the destination cannot be
            written.
    """
    src_file = os.path.join(src_dir, old_file_name)
    # Without the directory, shutil.copy would create a plain *file* called
    # like dst_dir instead of copying into it.
    os.makedirs(dst_dir, exist_ok=True)
    # Copy straight to the final name: no intermediate file that could
    # overwrite an unrelated file in dst_dir or be left behind on failure.
    # An existing file with the new name is overwritten.
    shutil.copy(src_file, os.path.join(dst_dir, new_file_name))
def get_emails(s):
    """Return a lazy iterator over the email-like strings matched in ``s``.

    Candidates that begin with '//' are dropped: for text such as
    'http://foo@bar.com' the pattern picks up '//foo@bar.com', which is a
    URL fragment rather than an address.
    """
    # Group 1 of the module-level pattern spans the entire match.
    candidates = (match.group(1) for match in regex.finditer(s))
    return (address for address in candidates if not address.startswith('//'))
def convert_pdf_to_txt(path, pages=None):
    """Extract the text of a PDF file using pdfminer.

    Args:
        path: Filesystem path of the PDF to read.
        pages: Optional iterable of page numbers to restrict extraction to.
            When falsy, an empty set is passed to ``PDFPage.get_pages``
            (presumably meaning "all pages" -- confirm against the pdfminer
            documentation).

    Returns:
        The extracted text as a single string.
    """
    pagenums = set(pages) if pages else set()
    # Context managers / finally guarantee that the buffer, the converter and
    # the input file are released even if a page fails to parse.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer only after the converter is closed, as before.
        return output.getvalue()
# Iterate through files in source directory/to-process
# Parse them using pdf miner
# Copy to other
def main():
    """Rename every PDF in ``src_dir`` after the first email address it contains.

    Each ``*.pdf`` file in ``src_dir`` is converted to text. If an email
    address is found, the file is copied into ``dst_dir`` as
    ``<email>.pdf`` and the original is removed. Files without an address,
    or whose copy fails, are left untouched in ``src_dir``. A summary of
    processed / examined files is printed at the end.
    """
    processed = 0  # PDFs successfully copied and removed from src_dir
    total = 0      # PDFs examined
    for filename in os.listdir(src_dir):
        if not filename.endswith('.pdf'):
            continue
        total += 1
        path = os.path.join(src_dir, filename)
        first_email = next(get_emails(convert_pdf_to_txt(path)), None)
        if first_email is None:
            print('No email addresses found', '\t', 'skipped...')
            continue
        print(path, '\t', first_email)
        print("Copying...")
        # The email pattern accepts '/' in the local part; left as-is it
        # would be treated as a path separator and could place the copy
        # outside dst_dir.
        new_name = first_email.replace('/', '_') + '.pdf'
        print(new_name)
        try:
            copy_rename(filename, new_name)
        except OSError as err:
            print("OS error: {0}".format(err))
            continue  # keep the source file: the copy did not succeed
        except ValueError as err:
            print("Invalid file name: {0}".format(err))
            continue
        # Any other exception propagates with its traceback.
        os.remove(path)
        print("Done.")
        processed += 1
    print("All set!", '\t', str(processed), '/', str(total), ' resumes processed')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment