Skip to content

Instantly share code, notes, and snippets.

@victorres11
Last active November 18, 2021 22:54
Show Gist options
  • Save victorres11/318a8abcca8b0027bc7614cc496ee616 to your computer and use it in GitHub Desktop.
Save victorres11/318a8abcca8b0027bc7614cc496ee616 to your computer and use it in GitHub Desktop.
PDF Splitter for Morning Star Reports
from PyPDF2 import PdfFileReader, PdfFileMerger, PdfFileWriter
from datetime import datetime
import os, os.path
import errno
import sys
# Report-type labels. Each one is used both in the output filename and as the
# name of the directory the split report is written to.
INTERNAL_FACTSHEET = "FS_Int"
MS_ESG_Report = "ESG"
CMP = "CMP"

# Every report type that gets split out of each combined deck.
REPORT_TYPES = [
    INTERNAL_FACTSHEET,
    MS_ESG_Report,
    CMP,
]

# QTR = "Q{}".format(current_quarter()) # Maybe include this in script launch args instead of calculating.

# Strategy-name prefix of iShares funds, which are filed under BlackRock.
ISHARES_STRATEGY = "iShares"
# Firm Names
MORGAN_STANLEY = "Morgan Stanley"
ARK = "ARK"
MATTHEWS_ASIA = "Matthews Asia"
COHEN_STEERS = "Cohen & Steers"
BLACKROCK = "BlackRock"
AB = "AB"
CALVERT = "Calvert"
HARDING_LOEVNER = "Harding Loevner"
LAZARD = "Lazard"
POLEN = "Polen"
SPDR = "SPDR"
VANGUARD = "Vanguard"
VICTORYSHARES = "VictoryShares"
WASATCH = "Wasatch"
BRECKINRIDGE = "Breckinridge"
DOUBLE_LINE = "DoubleLine"
PIMCO = "PIMCO"

# Known firms, used to detect the firm from the start of a strategy name.
# Detection takes the first entry that matches, so the order is kept as-is;
# each firm is listed exactly once.
FIRM_NAMES = [
    MORGAN_STANLEY,
    ARK,
    MATTHEWS_ASIA,
    COHEN_STEERS,
    BLACKROCK,
    AB,
    CALVERT,
    HARDING_LOEVNER,
    LAZARD,
    POLEN,
    SPDR,
    VANGUARD,
    VICTORYSHARES,
    WASATCH,
    BRECKINRIDGE,
    DOUBLE_LINE,
    PIMCO,
]
# Zero-based page indexes that each report occupies inside a combined deck.
INTERNAL_FACTSHEET_PAGES = range(2)            # pages 0-1
SUSTAINABILITY_FACTSHEET_PAGES = range(2, 7)   # pages 2-6
CM_REPORT_PAGES = range(7, 12)                 # pages 7-11

# Lookup from report-type label to the pages that make up that report.
PAGE_RANGES_BY_REPORT_TYPE = {
    INTERNAL_FACTSHEET: INTERNAL_FACTSHEET_PAGES,
    MS_ESG_Report: SUSTAINABILITY_FACTSHEET_PAGES,
    CMP: CM_REPORT_PAGES,
}
def current_quarter():
    """Return the calendar quarter (1-4) that the current month falls in."""
    this_month = datetime.now().month
    # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    return (this_month - 1) // 3 + 1
def mkdir_p(path):
    """Create directory ``path`` and any missing parents, like ``mkdir -p``.

    A directory that already exists is not an error. Any other failure
    propagates as ``OSError`` -- for example when ``path`` exists but is a
    regular file.
    """
    # exist_ok=True only tolerates an existing *directory*, which is the same
    # condition the former errno.EEXIST / os.path.isdir check accepted.
    os.makedirs(path, exist_ok=True)
def safe_open_w(path):
    """Open ``path`` for binary writing, creating parent directories as needed.

    Returns the open file object (mode ``'wb'``); the caller is responsible
    for closing it, typically via a ``with`` statement.
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has no directory component; there is nothing to create,
    # and asking for an empty path to be created would fail.
    if parent_dir:
        mkdir_p(parent_dir)
    return open(path, 'wb')
# Breckinridge Intr Sust TaxEff # 2021.Q2 Breckinridge Interm Sustainable TaxEff - CMR.pdf
# Cohen & Steers Real Estate Sec # 2021.Q2 Cohen & Steers Real Estate Securities - CMR.pdf
# PIMCO Invst Gd Credit Bond # 2021.Q2 PIMCO Investment Grade Credit Bond Instl - CMR
# VictoryShares US Sm HiDiv VolWtd # 2021.Q2 VictoryShares US SmCp Hi Div Vol Wtd - Factsheet Internal.pdf
# SHORTENED_NAME_MAPPING = {
# "Breckinridge Interm Sustainable TaxEff" : "Intr Sust TaxEff",
# "Cohen & Steers Real Estate Securities" : "Cohen & Steers Real Estate Sec",
# "PIMCO Investment Grade Credit Bond Instl" : "PIMCO Invst Gd Credit Bond",
# "VictoryShares US SmCp Hi Div Vol Wtd" : "VictoryShares US Sm HiDiv VolWtd"
# }
# Abbreviations substituted for long firm names when building output
# filenames, to keep those filenames short.
FIRMNAME_MAPPINGS = {
    MORGAN_STANLEY: "MS",
    BRECKINRIDGE: "BRCK",
    COHEN_STEERS: "C&S",
    VICTORYSHARES: "VS",
    MATTHEWS_ASIA: "MA",
    HARDING_LOEVNER: "HL",
    DOUBLE_LINE: "DL",
}
def generate_output_filename(strategy_name, report_type, QTR):
    """Generate the filename used to store one split report.

    The name has the form ``{year}.{QTR}{strategy}-{report_type}.pdf`` with
    all spaces removed, where ``year`` is the current calendar year and firm
    names listed in ``FIRMNAME_MAPPINGS`` are replaced by their abbreviation.

    Lengthy filenames caused errors, so the result is capped at 69
    characters. Only the strategy-name portion is shortened; the year/quarter
    prefix and the ``-{report_type}.pdf`` suffix are always kept intact so
    the file stays identifiable and keeps a valid extension.

    Args:
        strategy_name: Strategy name taken from the source filename.
        report_type: Report-type label appended to the name.
        QTR: Quarter label placed after the year, e.g. ``"Q2"``.

    Returns:
        The filename as a string.
    """
    max_character_count_filename = 69

    # Swap long firm names for their abbreviations.
    for firm_name, abbreviation in FIRMNAME_MAPPINGS.items():
        strategy_name = strategy_name.replace(firm_name, abbreviation)

    # Filename is generated without spaces since there's a char limit.
    prefix = "{year}.{qtr}".format(
        year=datetime.now().year,
        qtr=QTR,
    ).replace(" ", "")
    suffix = "-{report_type}.pdf".format(report_type=report_type).replace(" ", "")
    compact_strategy = strategy_name.replace(" ", "")

    # Trim only the strategy part so the whole name fits within the limit.
    room_for_strategy = max(
        max_character_count_filename - len(prefix) - len(suffix), 0
    )
    return prefix + compact_strategy[:room_for_strategy] + suffix
def generate_dir_structure(firm_name, strategy_name, report_type):
    """Build the relative directory (with trailing slash) for one report.

    Layout: Investments/Strategies/<firm>/<strategy>/Reports/<report type>/
    """
    template = "Investments/Strategies/{firm_name}/{strategy_name}/Reports/{report_type}/"
    path_fields = {
        "firm_name": firm_name,
        "strategy_name": strategy_name,
        "report_type": report_type,
    }
    return template.format(**path_fields)
def _detect_firm_name(strategy_name):
    """Return the firm a strategy name belongs to, or None if none matches."""
    strategy_name_upper = strategy_name.upper()
    # iShares strategy names do not start with the firm name; they are filed
    # under BlackRock.
    if strategy_name_upper.startswith(ISHARES_STRATEGY.upper()):
        return BLACKROCK
    for name in FIRM_NAMES:
        if strategy_name_upper.startswith(name.upper()):
            return name
    return None


def process_files(root_dir, QTR):
    """Split every combined report deck in ``root_dir`` into per-type PDFs.

    For each PDF in ``root_dir`` the strategy name is taken from the part of
    the filename before the first ``' - '``, the firm is detected from the
    start of that name, and one output PDF per entry in ``REPORT_TYPES`` is
    written under ``./FCM/`` using the directory layout from
    ``generate_dir_structure`` and the name from ``generate_output_filename``.

    Entries that are not PDF files (e.g. hidden system files) and files whose
    firm cannot be detected are skipped with a message instead of aborting
    the run or being filed under the previous file's firm.

    Args:
        root_dir: Directory holding the combined deck PDFs.
        QTR: Quarter label used in output filenames, e.g. ``"Q2"``.
    """
    # root_dir = '/Users/victortorres/projects/firecap/morningstarsplitter/2021.Q2 - Morningstar Reports KillMe/'
    # root_dir = './2021.Q2 - Morningstar Reports - KillMe/'
    files_in_dir = os.listdir(root_dir)
    print("HEY!")
    print(files_in_dir)

    # For every FULL DECK file in the folder
    for filename in files_in_dir:
        # os.path.join works whether or not root_dir ends with a separator.
        input_path = os.path.join(root_dir, filename)
        if not (filename.lower().endswith(".pdf") and os.path.isfile(input_path)):
            print("Skipping {} (not a PDF file)".format(filename))
            continue

        print("-----------------------------------------------------------------------------")
        print("Starting splitting processing for {}".format(filename))
        print("root_dir = {}, filename = {}".format(root_dir, filename))

        strategy_name = filename.split(' - ')[0]

        # Detect firm_name from the known filename structures.
        firm_name = _detect_firm_name(strategy_name)
        if firm_name is None:
            print("Skipping {}: no known firm matches strategy name '{}'".format(
                filename, strategy_name))
            continue

        input_pdf = PdfFileReader(input_path)

        # Create a report for each type of report.
        for report_type in REPORT_TYPES:
            # Instantiate file per report
            output = PdfFileWriter()
            output_filename = generate_output_filename(strategy_name, report_type, QTR)
            dir_structure = generate_dir_structure(firm_name, strategy_name, report_type)

            # NOTE(review): page ranges are fixed, so each deck is presumably
            # expected to contain at least 12 pages -- confirm.
            for page_num in PAGE_RANGES_BY_REPORT_TYPE[report_type]:
                output.addPage(input_pdf.getPage(page_num))

            with safe_open_w("./FCM/{}/{}".format(dir_structure, output_filename)) as output_stream:
                output.write(output_stream)
            print("Report type {} should be complete at {}{}.... ".format(report_type, dir_structure, output_filename))
# Investments > Strategies > ARK > ARKK > Reports > ESG Report
# Investments > Strategies > {Firm Name} > {Strategy Name} > Reports > {Report Type}
# When zipping these files on a Mac, the archive usually picks up hidden .DS_Store and __MACOSX entries that create problems for users on PCs.
# You can zip first and then delete these entries from the archive:
# zip -d data.zip "__MACOSX/*"
# zip -d data.zip "*/.DS_Store"
# https://perishablepress.com/remove-macosx-ds-store-zip-files-mac/
if __name__ == "__main__":
    # Ask the user where the combined decks live and which quarter label to
    # stamp on the output filenames, then run the split.
    source_dir = input("What is the source path that holds on all the files?: ")
    quarter_label = input("Which quarter is this for? i.e. 'Q2': ")
    process_files(source_dir, quarter_label)
    print("Script is complete!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment