# victorres11/pdfsplitter.py

## pdfsplitter.py
from PyPDF2 import PdfFileReader, PdfFileMerger, PdfFileWriter
from datetime import datetime
import os, os.path
import errno
import sys

# Report-type labels. Each label is used both in the generated output
# filename and as the name of the per-report output directory.
INTERNAL_FACTSHEET = "FS_Int"
MS_ESG_Report      = "ESG"
CMP                = "CMP"
# Every combined deck is split into one PDF per entry, in this order.
REPORT_TYPES = [INTERNAL_FACTSHEET, MS_ESG_Report, CMP]

# QTR = "Q{}".format(current_quarter()) # Maybe include this in script launch args instead of calculating.
# Strategy-name prefix that process_files files under the BlackRock firm.
ISHARES_STRATEGY = "iShares"

# Firm names exactly as they appear at the start of a strategy name.
MORGAN_STANLEY  = "Morgan Stanley"
ARK             = "ARK"
MATTHEWS_ASIA   = "Matthews Asia"
COHEN_STEERS    = "Cohen & Steers"
BLACKROCK       = "BlackRock"
AB              = "AB"
CALVERT         = "Calvert"
HARDING_LOEVNER = "Harding Loevner"
LAZARD          = "Lazard"
POLEN           = "Polen"
SPDR            = "SPDR"
VANGUARD        = "Vanguard"
VICTORYSHARES   = "VictoryShares"
WASATCH         = "Wasatch"
BRECKINRIDGE    = "Breckinridge"
DOUBLE_LINE     = "DoubleLine"
PIMCO           = "PIMCO"

# Firms checked, in order, when matching a strategy name by prefix; the first
# match wins. Each firm is listed once (earlier duplicates were redundant
# because a repeated entry can never be the first match).
FIRM_NAMES = [
    MORGAN_STANLEY,
    ARK,
    MATTHEWS_ASIA,
    COHEN_STEERS,
    BLACKROCK,
    AB,
    CALVERT,
    HARDING_LOEVNER,
    LAZARD,
    POLEN,
    SPDR,
    VANGUARD,
    VICTORYSHARES,
    WASATCH,
    BRECKINRIDGE,
    DOUBLE_LINE,
    PIMCO,
    ]

# Zero-based page indices inside each combined deck (range end is exclusive).
INTERNAL_FACTSHEET_PAGES       = range(0, 2)   # pages 0-1
SUSTAINABILITY_FACTSHEET_PAGES = range(2, 7)   # pages 2-6
CM_REPORT_PAGES                = range(7, 12)  # pages 7-11

# Pages copied into the output PDF for each report type.
PAGE_RANGES_BY_REPORT_TYPE = {
    INTERNAL_FACTSHEET : INTERNAL_FACTSHEET_PAGES,
    MS_ESG_Report      : SUSTAINABILITY_FACTSHEET_PAGES,
    CMP                : CM_REPORT_PAGES
}

def current_quarter():
    """Return the calendar quarter (1-4) that contains today's date."""
    this_month = datetime.now().month
    # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    return (this_month - 1) // 3 + 1

def mkdir_p(path):
    """Create ``path`` and any missing parents, like ``mkdir -p``.

    An already-existing directory is not an error; any other OSError
    (including ``path`` existing as a regular file) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        directory_already_there = err.errno == errno.EEXIST and os.path.isdir(path)
        if not directory_already_there:
            raise

def safe_open_w(path):
    """Open ``path`` for binary writing, creating parent directories as needed.

    Args:
        path: Destination file path; may be a bare filename.

    Returns:
        A file object opened in ``'wb'`` mode. The caller must close it
        (typically by using it in a ``with`` statement).

    Raises:
        OSError: If a parent directory cannot be created or the file
            cannot be opened.
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has no directory part, and os.makedirs("") would raise,
    # so only create directories when there is something to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    return open(path, 'wb')

# Breckinridge Intr Sust TaxEff         # 2021.Q2 Breckinridge Interm Sustainable TaxEff - CMR.pdf
# Cohen & Steers Real Estate Sec        # 2021.Q2 Cohen & Steers Real Estate Securities - CMR.pdf
# PIMCO Invst Gd Credit Bond            # 2021.Q2 PIMCO Investment Grade Credit Bond Instl - CMR
# VictoryShares US Sm HiDiv VolWtd      # 2021.Q2 VictoryShares US SmCp Hi Div Vol Wtd - Factsheet Internal.pdf

# SHORTENED_NAME_MAPPING = {
#     "Breckinridge Interm Sustainable TaxEff" : "Intr Sust TaxEff",
#     "Cohen & Steers Real Estate Securities" : "Cohen & Steers Real Estate Sec",
#     "PIMCO Investment Grade Credit Bond Instl" : "PIMCO Invst Gd Credit Bond",
#     "VictoryShares US SmCp Hi Div Vol Wtd" : "VictoryShares US Sm HiDiv VolWtd"
# }

# Abbreviations substituted for firm names when building output filenames
# (see generate_output_filename). Firms not listed here keep their full name.
FIRMNAME_MAPPINGS = {
    MORGAN_STANLEY  : "MS",
    BRECKINRIDGE    : "BRCK",
    COHEN_STEERS    : "C&S",
    VICTORYSHARES   : "VS",
    MATTHEWS_ASIA   : "MA",
    HARDING_LOEVNER : "HL",
    DOUBLE_LINE     : "DL"
}

def generate_output_filename(strategy_name, report_type, QTR):
    """Build the space-free PDF filename for one split report.

    The name has the form ``{year}.{QTR}{strategy}-{report_type}.pdf``, where
    ``year`` is the current calendar year and every firm name found in
    ``strategy_name`` is replaced by its FIRMNAME_MAPPINGS abbreviation.

    Args:
        strategy_name: Strategy name taken from the source deck's filename.
        report_type: Report label placed just before the extension.
        QTR: Quarter label, e.g. "Q2".

    Returns:
        The filename with all spaces removed. Names that would exceed 69
        characters have their leading portion trimmed so the whole name fits
        in 69 characters while still ending in ``-{report_type}.pdf``.
    """
    max_character_count_filename = 69

    for firm_name, abbreviation in FIRMNAME_MAPPINGS.items():
        strategy_name = strategy_name.replace(firm_name, abbreviation)

    # Filename is generated without spaces since there's a char limit.
    suffix = "-{report_type}.pdf".format(report_type=report_type).replace(" ", "")
    stem = "{year}.{qtr}{strategy_name}".format(
            year          = datetime.now().year,
            qtr           = QTR,
            strategy_name = strategy_name
            ).replace(" ", "")

    # Trim only the stem: cutting the assembled name would drop the report
    # type and extension. Short names pass through unchanged because the
    # slice is then a no-op.
    room_for_stem = max(0, max_character_count_filename - len(suffix))
    return stem[:room_for_stem] + suffix

def generate_dir_structure(firm_name, strategy_name, report_type):
    """Return the relative output directory (with trailing slash) for a report.

    Layout: Investments/Strategies/<firm>/<strategy>/Reports/<report type>/
    """
    layout = "Investments/Strategies/{firm_name}/{strategy_name}/Reports/{report_type}/"
    placeholders = {
        "firm_name": firm_name,
        "strategy_name": strategy_name,
        "report_type": report_type,
    }
    return layout.format(**placeholders)

def process_files(root_dir, QTR):
    """Split every combined PDF deck in ``root_dir`` into per-report PDFs.

    For each PDF file in ``root_dir`` the strategy name is taken from the
    part of the filename before the first ' - ', the firm is detected from
    the strategy name's prefix, and one output PDF per entry in REPORT_TYPES
    is written under ``./FCM/`` using the directory layout from
    generate_dir_structure and the name from generate_output_filename.

    Entries that are not PDF files (sub-folders, .DS_Store, ...) and decks
    whose firm cannot be identified are reported and skipped.

    Args:
        root_dir: Folder holding the combined decks.
        QTR: Quarter label (e.g. "Q2") embedded in every output filename.
    """
    files_in_dir = os.listdir(root_dir)
    print(files_in_dir)

    # For every FULL DECK file in the folder
    for filename in files_in_dir:
        source_path = os.path.join(root_dir, filename)
        if not (os.path.isfile(source_path) and filename.lower().endswith(".pdf")):
            print("Skipping {}: not a PDF file".format(filename))
            continue

        print("-----------------------------------------------------------------------------")
        print("Starting splitting processing for {}".format(filename))
        print("root_dir = {}, filename = {}".format(root_dir, filename))
        strategy_name = filename.split(' - ')[0]

        # Resolve the firm afresh for every file so an unmatched deck can
        # never inherit the previous file's firm (or hit an unbound name).
        firm_name = _detect_firm_name(strategy_name)
        if firm_name is None:
            print("Skipping {}: no known firm matches strategy '{}'".format(filename, strategy_name))
            continue

        # Keep the source open while pages are copied and written: the
        # reader pulls page data from the stream on demand.
        with open(source_path, 'rb') as input_stream:
            input_pdf = PdfFileReader(input_stream)
            for report_type in REPORT_TYPES:
                # Instantiate file per report
                output = PdfFileWriter()
                output_filename = generate_output_filename(strategy_name, report_type, QTR)
                dir_structure   = generate_dir_structure(firm_name, strategy_name, report_type)
                for page_num in PAGE_RANGES_BY_REPORT_TYPE[report_type]:
                    output.addPage(input_pdf.getPage(page_num))
                # Write once, after all pages of this report are collected.
                with safe_open_w("./FCM/{}/{}".format(dir_structure, output_filename)) as output_stream:
                    output.write(output_stream)
                print("Report type {} should be complete at {}{}.... ".format(report_type, dir_structure, output_filename))


def _detect_firm_name(strategy_name):
    """Return the firm whose name prefixes ``strategy_name``, or None if unknown."""
    strategy_name_upper = strategy_name.upper()
    # iShares strategies are filed under BlackRock.
    if strategy_name_upper.startswith(ISHARES_STRATEGY.upper()):
        return BLACKROCK
    for name in FIRM_NAMES:
        if strategy_name_upper.startswith(name.upper()):
            return name
    return None


# Investments > Strategies > ARK > ARKK > Reports > ESG Report
# Investments > Strategies > {Firm Name} > {Strategy Name} > Reports > {Report Type}

# Zip archives created on macOS usually contain hidden .DS_Store files and a __MACOSX folder, which cause problems for users on PCs.
# You can zip first and then delete those entries from the archive:
# zip -d data.zip "__MACOSX/*"
# zip -d data.zip "*/.DS_Store"
# https://perishablepress.com/remove-macosx-ds-store-zip-files-mac/

if __name__ == "__main__":
    # Interactive entry point: ask where the combined decks live and which
    # quarter label to stamp on the output filenames, then split them all.
    root_dir = input("What is the source path that holds on all the files?: ").strip()
    # Guarantee a trailing path separator so the folder and a filename can be
    # joined by plain concatenation even if the user omitted it.
    root_dir = os.path.join(root_dir, "")
    qtr = input("Which quarter is this for? i.e. 'Q2': ").strip()
    process_files(root_dir, qtr)
    print("Script is complete!")
	from PyPDF2 import PdfFileReader, PdfFileMerger, PdfFileWriter
	from datetime import datetime
	import os, os.path
	import errno
	import sys

	# Report-type labels. Each label is used both in the generated output
	# filename and as the name of the per-report output directory.
	INTERNAL_FACTSHEET = "FS_Int"
	MS_ESG_Report = "ESG"
	CMP = "CMP"
	# Every combined deck is split into one PDF per entry, in this order.
	REPORT_TYPES = [INTERNAL_FACTSHEET, MS_ESG_Report, CMP]

	# QTR = "Q{}".format(current_quarter()) # Maybe include this in script launch args instead of calculating.
	# Strategy-name prefix that process_files files under the BlackRock firm.
	ISHARES_STRATEGY = "iShares"

	# Firm names exactly as they appear at the start of a strategy name.
	MORGAN_STANLEY = "Morgan Stanley"
	ARK = "ARK"
	MATTHEWS_ASIA = "Matthews Asia"
	COHEN_STEERS = "Cohen & Steers"
	BLACKROCK = "BlackRock"
	AB = "AB"
	CALVERT = "Calvert"
	HARDING_LOEVNER = "Harding Loevner"
	LAZARD = "Lazard"
	POLEN = "Polen"
	SPDR = "SPDR"
	VANGUARD = "Vanguard"
	VICTORYSHARES = "VictoryShares"
	WASATCH = "Wasatch"
	BRECKINRIDGE = "Breckinridge"
	DOUBLE_LINE = "DoubleLine"
	PIMCO = "PIMCO"

	# Firms checked, in order, when matching a strategy name by prefix; the
	# first match wins. Each firm is listed once (earlier duplicates were
	# redundant because a repeated entry can never be the first match).
	FIRM_NAMES = [
		MORGAN_STANLEY,
		ARK,
		MATTHEWS_ASIA,
		COHEN_STEERS,
		BLACKROCK,
		AB,
		CALVERT,
		HARDING_LOEVNER,
		LAZARD,
		POLEN,
		SPDR,
		VANGUARD,
		VICTORYSHARES,
		WASATCH,
		BRECKINRIDGE,
		DOUBLE_LINE,
		PIMCO,
	]

	# Zero-based page indices inside each combined deck (range end is exclusive).
	INTERNAL_FACTSHEET_PAGES = range(0, 2)  # pages 0-1
	SUSTAINABILITY_FACTSHEET_PAGES = range(2, 7)  # pages 2-6
	CM_REPORT_PAGES = range(7, 12)  # pages 7-11

	# Pages copied into the output PDF for each report type.
	PAGE_RANGES_BY_REPORT_TYPE = {
	INTERNAL_FACTSHEET : INTERNAL_FACTSHEET_PAGES,
	MS_ESG_Report : SUSTAINABILITY_FACTSHEET_PAGES,
	CMP : CM_REPORT_PAGES
	}

	def current_quarter():
		"""Return the calendar quarter (1-4) that contains today's date."""
		month = datetime.now().month
		# Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
		return (month - 1) // 3 + 1

	def mkdir_p(path):
		"""Create ``path`` and any missing parents, like ``mkdir -p``.

		An already-existing directory is not an error; any other OSError
		(including ``path`` existing as a regular file) is re-raised.
		"""
		try:
			os.makedirs(path)
		except OSError as exc:
			if exc.errno == errno.EEXIST and os.path.isdir(path):
				pass
			else:
				raise

	def safe_open_w(path):
		"""Open ``path`` for binary writing, creating parent directories as needed.

		Args:
			path: Destination file path; may be a bare filename.

		Returns:
			A file object opened in ``'wb'`` mode. The caller must close it
			(typically by using it in a ``with`` statement).

		Raises:
			OSError: If a parent directory cannot be created or the file
				cannot be opened.
		"""
		parent_dir = os.path.dirname(path)
		# A bare filename has no directory part, and os.makedirs("") would
		# raise, so only create directories when there is something to create.
		if parent_dir:
			os.makedirs(parent_dir, exist_ok=True)
		return open(path, 'wb')

	# Breckinridge Intr Sust TaxEff # 2021.Q2 Breckinridge Interm Sustainable TaxEff - CMR.pdf
	# Cohen & Steers Real Estate Sec # 2021.Q2 Cohen & Steers Real Estate Securities - CMR.pdf
	# PIMCO Invst Gd Credit Bond # 2021.Q2 PIMCO Investment Grade Credit Bond Instl - CMR
	# VictoryShares US Sm HiDiv VolWtd # 2021.Q2 VictoryShares US SmCp Hi Div Vol Wtd - Factsheet Internal.pdf

	# SHORTENED_NAME_MAPPING = {
	# "Breckinridge Interm Sustainable TaxEff" : "Intr Sust TaxEff",
	# "Cohen & Steers Real Estate Securities" : "Cohen & Steers Real Estate Sec",
	# "PIMCO Investment Grade Credit Bond Instl" : "PIMCO Invst Gd Credit Bond",
	# "VictoryShares US SmCp Hi Div Vol Wtd" : "VictoryShares US Sm HiDiv VolWtd"
	# }

	# Abbreviations substituted for firm names when building output filenames
	# (see generate_output_filename). Firms not listed here keep their full name.
	FIRMNAME_MAPPINGS = {
	MORGAN_STANLEY : "MS",
	BRECKINRIDGE : "BRCK",
	COHEN_STEERS : "C&S",
	VICTORYSHARES : "VS",
	MATTHEWS_ASIA : "MA",
	HARDING_LOEVNER : "HL",
	DOUBLE_LINE : "DL"
	}

	def generate_output_filename(strategy_name, report_type, QTR):
		"""Build the space-free PDF filename for one split report.

		The name has the form ``{year}.{QTR}{strategy}-{report_type}.pdf``,
		where ``year`` is the current calendar year and every firm name found
		in ``strategy_name`` is replaced by its FIRMNAME_MAPPINGS abbreviation.

		Args:
			strategy_name: Strategy name taken from the source deck's filename.
			report_type: Report label placed just before the extension.
			QTR: Quarter label, e.g. "Q2".

		Returns:
			The filename with all spaces removed. Names that would exceed 69
			characters have their leading portion trimmed so the whole name
			fits in 69 characters while still ending in ``-{report_type}.pdf``.
		"""
		max_character_count_filename = 69

		for firm_name, abbreviation in FIRMNAME_MAPPINGS.items():
			strategy_name = strategy_name.replace(firm_name, abbreviation)

		# Filename is generated without spaces since there's a char limit.
		suffix = "-{report_type}.pdf".format(report_type=report_type).replace(" ", "")
		stem = "{year}.{qtr}{strategy_name}".format(
			year = datetime.now().year,
			qtr = QTR,
			strategy_name = strategy_name
		).replace(" ", "")

		# Trim only the stem: cutting the assembled name would drop the report
		# type and extension. Short names pass through unchanged because the
		# slice is then a no-op.
		room_for_stem = max(0, max_character_count_filename - len(suffix))
		return stem[:room_for_stem] + suffix

	def generate_dir_structure(firm_name, strategy_name, report_type):
		"""Return the relative output directory (with trailing slash) for a report.

		Layout: Investments/Strategies/<firm>/<strategy>/Reports/<report type>/
		"""
		return "Investments/Strategies/{firm_name}/{strategy_name}/Reports/{report_type}/".format(
			firm_name = firm_name,
			strategy_name = strategy_name,
			report_type = report_type
		)

	def process_files(root_dir, QTR):
		"""Split every combined PDF deck in ``root_dir`` into per-report PDFs.

		For each PDF file in ``root_dir`` the strategy name is taken from the
		part of the filename before the first ' - ', the firm is detected from
		the strategy name's prefix, and one output PDF per entry in
		REPORT_TYPES is written under ``./FCM/`` using the directory layout
		from generate_dir_structure and the name from generate_output_filename.

		Entries that are not PDF files (sub-folders, .DS_Store, ...) and decks
		whose firm cannot be identified are reported and skipped.

		Args:
			root_dir: Folder holding the combined decks.
			QTR: Quarter label (e.g. "Q2") embedded in every output filename.
		"""
		files_in_dir = os.listdir(root_dir)
		print(files_in_dir)

		# For every FULL DECK file in the folder
		for filename in files_in_dir:
			source_path = os.path.join(root_dir, filename)
			if not (os.path.isfile(source_path) and filename.lower().endswith(".pdf")):
				print("Skipping {}: not a PDF file".format(filename))
				continue

			print("-----------------------------------------------------------------------------")
			print("Starting splitting processing for {}".format(filename))
			print("root_dir = {}, filename = {}".format(root_dir, filename))
			strategy_name = filename.split(' - ')[0]

			# Resolve the firm afresh for every file so an unmatched deck can
			# never inherit the previous file's firm (or hit an unbound name).
			firm_name = _detect_firm_name(strategy_name)
			if firm_name is None:
				print("Skipping {}: no known firm matches strategy '{}'".format(filename, strategy_name))
				continue

			# Keep the source open while pages are copied and written: the
			# reader pulls page data from the stream on demand.
			with open(source_path, 'rb') as input_stream:
				input_pdf = PdfFileReader(input_stream)
				for report_type in REPORT_TYPES:
					# Instantiate file per report
					output = PdfFileWriter()
					output_filename = generate_output_filename(strategy_name, report_type, QTR)
					dir_structure = generate_dir_structure(firm_name, strategy_name, report_type)
					for page_num in PAGE_RANGES_BY_REPORT_TYPE[report_type]:
						output.addPage(input_pdf.getPage(page_num))
					# Write once, after all pages of this report are collected.
					with safe_open_w("./FCM/{}/{}".format(dir_structure, output_filename)) as output_stream:
						output.write(output_stream)
					print("Report type {} should be complete at {}{}.... ".format(report_type, dir_structure, output_filename))


	def _detect_firm_name(strategy_name):
		"""Return the firm whose name prefixes ``strategy_name``, or None if unknown."""
		strategy_name_upper = strategy_name.upper()
		# iShares strategies are filed under BlackRock.
		if strategy_name_upper.startswith(ISHARES_STRATEGY.upper()):
			return BLACKROCK
		for name in FIRM_NAMES:
			if strategy_name_upper.startswith(name.upper()):
				return name
		return None


	# Investments > Strategies > ARK > ARKK > Reports > ESG Report
	# Investments > Strategies > {Firm Name} > {Strategy Name} > Reports > {Report Type}

	# When zipping these files they usually have hidden .DS_STORE and .___MACOSX files that create problems for users with PC's
	# You can Zip and then delete these annoying files
	# zip -d data.zip "__MACOSX/*"
	# zip -d data.zip "*/.DS_Store"
	# https://perishablepress.com/remove-macosx-ds-store-zip-files-mac/

	if __name__ == "__main__":
		# Interactive entry point: ask where the combined decks live and which
		# quarter label to stamp on the output filenames, then split them all.
		root_dir = input("What is the source path that holds on all the files?: ").strip()
		# Guarantee a trailing path separator so the folder and a filename can
		# be joined by plain concatenation even if the user omitted it.
		root_dir = os.path.join(root_dir, "")
		qtr = input("Which quarter is this for? i.e. 'Q2': ").strip()
		process_files(root_dir, qtr)
		print("Script is complete!")