# pjatx/renameResumes.py

## renameResumes.py
# Import global broad stuff
import os
import os.path
import shutil
import re
from optparse import OptionParser

# Import PDF Miner specific stuff to use as library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# Global Variables
# Resumes are read from 'to-process' and copied into 'processed'; both paths
# are relative to the current working directory.
src_dir = os.path.join(os.curdir, 'to-process')
dst_dir = os.path.join(os.curdir, 'processed')

# Regex to find emails.
# Group 1 is the whole address; groups 2 and 3 capture the '@' / ' at ' and
# '.' / ' dot ' separators, so spelled-out forms such as 'foo at bar dot com'
# match too.  The pattern is written with raw strings so the backslash
# escapes reach the regex engine without invalid-escape warnings, and it is
# compiled case-insensitively so an address containing capital letters is
# matched in full instead of being cut off at the upper-case character
# (e.g. 'John.doe@x.com' used to be reported as 'ohn.doe@x.com').
regex = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
    r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
    r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
    re.IGNORECASE)

# Helper Functions
def copy_rename(old_file_name, new_file_name):
  """Copy a file from src_dir into dst_dir under a new name.

  Args:
    old_file_name: Name of the file inside src_dir.
    new_file_name: Name the copy receives inside dst_dir.

  Raises:
    OSError: If the source cannot be read or the destination cannot be
      created or written.
  """
  src_file = os.path.join(src_dir, old_file_name)
  new_dst_file_name = os.path.join(dst_dir, new_file_name)

  # If dst_dir did not exist, shutil.copy would create a regular *file*
  # with that name instead of copying into a directory.
  os.makedirs(dst_dir, exist_ok=True)

  # Copy straight to the final name.  Going through a temporary copy named
  # old_file_name could overwrite an unrelated file of that name in dst_dir
  # and leave a stray copy behind when the rename failed.
  shutil.copy(src_file, new_dst_file_name)


def get_emails(s):
  """Return an iterator over the email addresses matched in string s."""
  # Every findall() hit is a tuple of the pattern's groups; the first group
  # holds the complete address.
  addresses = (groups[0] for groups in regex.findall(s))
  # The pattern can pick up the tail of a URL such as 'http://foo@bar.com'
  # as '//foo@bar.com', so matches that begin with '//' are dropped.
  return (address for address in addresses if address[:2] != '//')


def convert_pdf_to_txt(path, pages=None):
  """Extract the text of a PDF file with PDFMiner.

  Args:
    path: Path of the PDF file to read.
    pages: Optional iterable of page numbers.  It is turned into a set and
      handed to PDFPage.get_pages; None or an empty value yields an empty
      set (presumably meaning "all pages" -- confirm against PDFMiner).

  Returns:
    The text the TextConverter wrote, as one string.
  """
  pagenums = set(pages) if pages else set()

  with StringIO() as output:
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
      interpreter = PDFPageInterpreter(manager, converter)
      # The context manager closes the PDF even when a page fails to parse.
      with open(path, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
          interpreter.process_page(page)
    finally:
      converter.close()
    # Read the buffer only after the converter is closed, but before the
    # StringIO itself is closed on leaving the with-block.
    return output.getvalue()

# Iterate through the PDF files in the source directory (to-process),
# parse each one with PDFMiner to find an email address,
# and copy it to the destination directory (processed) named after it.
def main():
  """Rename every PDF in src_dir after the first email address found in it.

  Each '.pdf' file in src_dir is converted to text.  When an address is
  found, the file is copied to dst_dir as '<address>.pdf' and the original
  is removed.  Files without an address, and files whose copy failed with
  an OSError, are left untouched in src_dir.  A summary line is printed at
  the end.
  """
  i = 0  # resumes copied and removed from src_dir
  d = 0  # PDF files seen

  for filename in os.listdir(src_dir):
    if not filename.endswith('.pdf'):
      continue
    d += 1

    # Only the path is needed here; convert_pdf_to_txt opens the file
    # itself, and keeping a handle open would block os.remove on Windows.
    src_path = os.path.join(src_dir, filename)
    parsed = convert_pdf_to_txt(src_path)
    first_email = next(get_emails(parsed), None)

    if first_email is None:
      print('No email addresses found', '\t', 'skipped...')
      continue

    print(src_path, '\t', first_email)
    print("Copying...")

    # The address comes from the PDF text and the pattern allows '/', which
    # would otherwise be read as a directory separator in the target path.
    new_name = first_email.replace('/', '_') + '.pdf'
    print(new_name)

    try:
      copy_rename(filename, new_name)
    except OSError as err:
      print("OS error: {0}".format(err))
      # The copy failed, so the original must not be deleted.
      continue
    except Exception as err:
      print("Unexpected error:", type(err))
      raise

    os.remove(src_path)

    print("Done.")
    i += 1

  print("All set!", '\t', str(i), '/', str(d), ' resumes processed')

# Process the resumes only when this file is run as a script.
if __name__ == "__main__":
  main()
	# Import global broad stuff
	import os
	import os.path
	import shutil
	import re
	from optparse import OptionParser

	# Import PDF Miner specific stuff to use as library
	from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
	from pdfminer.converter import TextConverter
	from pdfminer.layout import LAParams
	from pdfminer.pdfpage import PDFPage
	from io import StringIO

	# Global Variables
	# Resumes are read from 'to-process' and copied into 'processed'; both
	# paths are relative to the current working directory.
	src_dir = os.path.join(os.curdir, 'to-process')
	dst_dir = os.path.join(os.curdir, 'processed')

	# Regex to find emails.
	# Group 1 is the whole address; groups 2 and 3 capture the '@' / ' at '
	# and '.' / ' dot ' separators.  The pattern had been damaged (escaped
	# '|' alternations and missing '*' quantifiers) and is restored here.
	# Raw strings keep the backslash escapes intact, and IGNORECASE lets
	# addresses containing capital letters match in full.
	regex = re.compile(
		r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
		r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
		r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
		re.IGNORECASE)

	# Helper Functions
	def copy_rename(old_file_name, new_file_name):
		"""Copy a file from src_dir into dst_dir under a new name.

		Args:
			old_file_name: Name of the file inside src_dir.
			new_file_name: Name the copy receives inside dst_dir.

		Raises:
			OSError: If the source cannot be read or the destination cannot
				be created or written.
		"""
		src_file = os.path.join(src_dir, old_file_name)
		new_dst_file_name = os.path.join(dst_dir, new_file_name)

		# If dst_dir did not exist, shutil.copy would create a regular
		# *file* with that name instead of copying into a directory.
		os.makedirs(dst_dir, exist_ok=True)

		# Copy straight to the final name so no file called old_file_name
		# in dst_dir is overwritten or left behind.
		shutil.copy(src_file, new_dst_file_name)


	def get_emails(s):
		"""Returns an iterator of matched emails found in string s."""
		# Dropping matches that start with '//' because the regular expression
		# mistakenly matches patterns like 'http://foo@bar.com' as
		# '//foo@bar.com'.  Each findall() hit is a tuple of groups; index 0
		# is the complete address.
		return (email[0] for email in re.findall(regex, s) if not email[0].startswith('//'))


	def convert_pdf_to_txt(path, pages=None):
		"""Extract the text of a PDF file with PDFMiner.

		Args:
			path: Path of the PDF file to read.
			pages: Optional iterable of page numbers.  It is turned into a
				set and handed to PDFPage.get_pages; None or an empty value
				yields an empty set (presumably meaning "all pages" --
				confirm against PDFMiner).

		Returns:
			The text the TextConverter wrote, as one string.
		"""
		pagenums = set(pages) if pages else set()

		with StringIO() as output:
			manager = PDFResourceManager()
			converter = TextConverter(manager, output, laparams=LAParams())
			try:
				interpreter = PDFPageInterpreter(manager, converter)
				# The context manager closes the PDF even when a page
				# fails to parse.
				with open(path, 'rb') as infile:
					for page in PDFPage.get_pages(infile, pagenums):
						interpreter.process_page(page)
			finally:
				converter.close()
			# Read the buffer after the converter is closed but before the
			# StringIO is closed on leaving the with-block.
			return output.getvalue()

	# Iterate through the PDF files in the source directory (to-process),
	# parse each one with PDFMiner to find an email address,
	# and copy it to the destination directory (processed) named after it.
	def main():
		"""Rename every PDF in src_dir after the first email address in it.

		Each '.pdf' file in src_dir is converted to text.  When an address
		is found, the file is copied to dst_dir as '<address>.pdf' and the
		original is removed.  Files without an address, and files whose
		copy failed with an OSError, are left untouched in src_dir.  A
		summary line is printed at the end.
		"""
		i = 0  # resumes copied and removed from src_dir
		d = 0  # PDF files seen

		for filename in os.listdir(src_dir):
			if not filename.endswith('.pdf'):
				continue
			d += 1

			# Only the path is needed here; convert_pdf_to_txt opens the
			# file itself, and an open handle would block os.remove on
			# Windows.
			src_path = os.path.join(src_dir, filename)
			parsed = convert_pdf_to_txt(src_path)
			first_email = next(get_emails(parsed), None)

			if first_email is None:
				print('No email addresses found', '\t', 'skipped...')
				continue

			print(src_path, '\t', first_email)
			print("Copying...")

			# The address comes from the PDF text and the pattern allows
			# '/', which would otherwise act as a directory separator.
			new_name = first_email.replace('/', '_') + '.pdf'
			print(new_name)

			try:
				copy_rename(filename, new_name)
			except OSError as err:
				print("OS error: {0}".format(err))
				# The copy failed, so the original must not be deleted.
				continue
			except Exception as err:
				print("Unexpected error:", type(err))
				raise

			os.remove(src_path)

			print("Done.")
			i += 1

		print("All set!", '\t', str(i), '/', str(d), ' resumes processed')

	# Process the resumes only when this file is run as a script.
	if __name__ == '__main__':
		main()