Last active
November 18, 2021 22:54
-
-
Save victorres11/318a8abcca8b0027bc7614cc496ee616 to your computer and use it in GitHub Desktop.
PDF Splitter for Morning Star Reports
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PyPDF2 import PdfFileReader, PdfFileMerger, PdfFileWriter | |
from datetime import datetime | |
import os, os.path | |
import errno | |
import sys | |
# Report types. Each one names an output sub-directory and a filename suffix.
INTERNAL_FACTSHEET = "FS_Int"
MS_ESG_Report = "ESG"
CMP = "CMP"
REPORT_TYPES = [INTERNAL_FACTSHEET, MS_ESG_Report, CMP]
# QTR = "Q{}".format(current_quarter()) # Maybe include this in script launch args instead of calculating.

# Strategy-name prefix that is filed under BlackRock.
ISHARES_STRATEGY = "iShares"

# Firm Names
MORGAN_STANLEY = "Morgan Stanley"
ARK = "ARK"
MATTHEWS_ASIA = "Matthews Asia"
COHEN_STEERS = "Cohen & Steers"
BLACKROCK = "BlackRock"
AB = "AB"
CALVERT = "Calvert"
HARDING_LOEVNER = "Harding Loevner"
LAZARD = "Lazard"
POLEN = "Polen"
SPDR = "SPDR"
VANGUARD = "Vanguard"
VICTORYSHARES = "VictoryShares"
WASATCH = "Wasatch"
BRECKINRIDGE = "Breckinridge"
DOUBLE_LINE = "DoubleLine"
PIMCO = "PIMCO"

# Every known firm, listed once. Firm detection takes the first entry whose
# name prefixes the strategy name, so the order of this list is significant.
FIRM_NAMES = [
    MORGAN_STANLEY,
    ARK,
    MATTHEWS_ASIA,
    COHEN_STEERS,
    BLACKROCK,
    AB,
    CALVERT,
    HARDING_LOEVNER,
    LAZARD,
    POLEN,
    SPDR,
    VANGUARD,
    VICTORYSHARES,
    WASATCH,
    BRECKINRIDGE,
    DOUBLE_LINE,
    PIMCO,
]

# Zero-based page ranges of each report inside the combined input PDF.
INTERNAL_FACTSHEET_PAGES = range(0, 2)
SUSTAINABILITY_FACTSHEET_PAGES = range(2, 7)
CM_REPORT_PAGES = range(7, 12)
PAGE_RANGES_BY_REPORT_TYPE = {
    INTERNAL_FACTSHEET: INTERNAL_FACTSHEET_PAGES,
    MS_ESG_Report: SUSTAINABILITY_FACTSHEET_PAGES,
    CMP: CM_REPORT_PAGES,
}
def current_quarter():
    """Return the calendar quarter (1-4) that contains today's date."""
    this_month = datetime.now().month
    # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    return (this_month - 1) // 3 + 1
def mkdir_p(path):
    """Create directory *path* and any missing parents, like ``mkdir -p``.

    An already existing directory is not an error. Any other ``OSError``
    (for example when *path* exists but is a regular file) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_directory = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise
def safe_open_w(path):
    """Open *path* for binary writing, creating any parent directories as needed.

    Returns the open file object (mode ``'wb'``); the caller is responsible
    for closing it, typically via a ``with`` statement.
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has no directory component; os.makedirs("") would raise,
    # so only create parents when there is something to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    return open(path, 'wb')
# Breckinridge Intr Sust TaxEff # 2021.Q2 Breckinridge Interm Sustainable TaxEff - CMR.pdf | |
# Cohen & Steers Real Estate Sec # 2021.Q2 Cohen & Steers Real Estate Securities - CMR.pdf | |
# PIMCO Invst Gd Credit Bond # 2021.Q2 PIMCO Investment Grade Credit Bond Instl - CMR | |
# VictoryShares US Sm HiDiv VolWtd # 2021.Q2 VictoryShares US SmCp Hi Div Vol Wtd - Factsheet Internal.pdf | |
# SHORTENED_NAME_MAPPING = { | |
# "Breckinridge Interm Sustainable TaxEff" : "Intr Sust TaxEff", | |
# "Cohen & Steers Real Estate Securities" : "Cohen & Steers Real Estate Sec", | |
# "PIMCO Investment Grade Credit Bond Instl" : "PIMCO Invst Gd Credit Bond", | |
# "VictoryShares US SmCp Hi Div Vol Wtd" : "VictoryShares US Sm HiDiv VolWtd" | |
# } | |
# Abbreviations substituted for firm names inside generated filenames.
# Firms without an entry keep their full name.
FIRMNAME_MAPPINGS = {
    MORGAN_STANLEY: "MS",
    BRECKINRIDGE: "BRCK",
    COHEN_STEERS: "C&S",
    VICTORYSHARES: "VS",
    MATTHEWS_ASIA: "MA",
    HARDING_LOEVNER: "HL",
    DOUBLE_LINE: "DL",
}
def generate_output_filename(strategy_name, report_type, QTR, *, year=None,
                             firm_abbreviations=None, max_length=69):
    """Generate the filename used to store one split report.

    The name has the shape ``{year}.{QTR}{strategy}-{report_type}.pdf`` with
    all spaces removed, because there is a character limit on filenames.

    Args:
        strategy_name: Strategy name as taken from the input filename.
        report_type: Report type label appended before the extension.
        QTR: Quarter label, e.g. ``"Q2"``.
        year: Year prefix; defaults to the current year.
        firm_abbreviations: Mapping of firm name -> abbreviation applied to
            ``strategy_name``; defaults to ``FIRMNAME_MAPPINGS``.
        max_length: Maximum length of the returned filename, extension
            included.

    Returns:
        The filename. When it would exceed ``max_length``, the leading
        ``{year}.{QTR}{strategy}`` part is trimmed so that the
        ``-{report_type}.pdf`` suffix stays intact.
    """
    if year is None:
        year = datetime.now().year
    if firm_abbreviations is None:
        firm_abbreviations = FIRMNAME_MAPPINGS

    # Shorten known firm names first; str.replace is a no-op when absent.
    for firm_name, abbreviation in firm_abbreviations.items():
        strategy_name = strategy_name.replace(firm_name, abbreviation)

    stem = "{year}.{qtr}{strategy_name}".format(
        year=year,
        qtr=QTR,
        strategy_name=strategy_name,
    ).replace(" ", "")
    suffix = "-{report_type}.pdf".format(report_type=report_type).replace(" ", "")

    # Trim only the stem so the result still ends in "-<report_type>.pdf".
    overflow = len(stem) + len(suffix) - max_length
    if overflow > 0:
        stem = stem[:max(0, len(stem) - overflow)]
    return stem + suffix
def generate_dir_structure(firm_name, strategy_name, report_type):
    """Build the relative directory (with trailing slash) in which a report is stored.

    Layout: Investments/Strategies/<firm>/<strategy>/Reports/<report type>/
    """
    layout = "Investments/Strategies/{firm_name}/{strategy_name}/Reports/{report_type}/"
    path_fields = {
        "firm_name": firm_name,
        "strategy_name": strategy_name,
        "report_type": report_type,
    }
    return layout.format(**path_fields)
def _detect_firm_name(strategy_name):
    """Return the firm that owns *strategy_name*, or None when no known firm matches."""
    strategy_name_upper = strategy_name.upper()
    # iShares strategies are filed under BlackRock.
    if strategy_name_upper.startswith(ISHARES_STRATEGY.upper()):
        return BLACKROCK
    # First matching prefix wins, so the order of FIRM_NAMES matters.
    for name in FIRM_NAMES:
        if strategy_name_upper.startswith(name.upper()):
            return name
    return None


def process_files(root_dir, QTR):
    """ Main function that will cycle through all files, split the condensed reports into
    their individual files, and store them at the agreed upon directory structure.

    Args:
        root_dir: Directory holding the combined report PDFs. A trailing
            path separator is optional.
        QTR: Quarter label (e.g. "Q2") used in the generated filenames.

    Entries that are not PDF files, and PDFs whose strategy name does not
    start with a known firm name, are reported and skipped. Output is written
    below "./FCM/" relative to the current working directory.

    NOTE(review): page ranges are not checked against the page count of each
    input, so every input is assumed to have at least 12 pages.
    """
    files_in_dir = os.listdir(root_dir)
    print("HEY!")
    print(files_in_dir)
    # For every FULL DECK file in the folder
    for filename in files_in_dir:
        input_path = os.path.join(root_dir, filename)
        # Hidden files such as .DS_Store and sub-directories are not reports.
        if not filename.lower().endswith(".pdf") or not os.path.isfile(input_path):
            print("Skipping {}: not a PDF file".format(filename))
            continue

        print("-----------------------------------------------------------------------------")
        print("Starting splitting processing for {}".format(filename))
        print("root_dir = {}, filename = {}".format(root_dir, filename))

        # Drop the extension first so a filename without ' - ' does not leave
        # ".pdf" inside the strategy name.
        strategy_name = os.path.splitext(filename)[0].split(' - ')[0]

        # Detect firm_name from the known filename structures. It is resolved
        # afresh for every file so a previous file's firm is never reused.
        firm_name = _detect_firm_name(strategy_name)
        if firm_name is None:
            print("Skipping {}: no known firm matches strategy name '{}'".format(filename, strategy_name))
            continue

        # Keep the input open until all reports are written, then close it.
        with open(input_path, "rb") as input_stream:
            input_pdf = PdfFileReader(input_stream)
            # Create a report for each type of report.
            for report_type in REPORT_TYPES:
                # Instantiate file per report
                output = PdfFileWriter()
                output_filename = generate_output_filename(strategy_name, report_type, QTR)
                dir_structure = generate_dir_structure(firm_name, strategy_name, report_type)
                for page_num in PAGE_RANGES_BY_REPORT_TYPE[report_type]:
                    output.addPage(input_pdf.getPage(page_num))
                with safe_open_w("./FCM/{}/{}".format(dir_structure, output_filename)) as output_stream:
                    output.write(output_stream)
                print("Report type {} should be complete at {}{}.... ".format(report_type, dir_structure, output_filename))
# Investments > Strategies > ARK > ARKK > Reports > ESG Report | |
# Investments > Strategies > {Firm Name} > {Strategy Name} > Reports > {Report Type} | |
# When zipping the output on macOS, the archive usually picks up hidden .DS_Store and __MACOSX entries that cause problems for users on PCs.
# You can zip first and then delete these entries from the archive:
# zip -d data.zip "__MACOSX/*" | |
# zip -d data.zip "*/.DS_Store" | |
# https://perishablepress.com/remove-macosx-ds-store-zip-files-mac/ | |
if __name__ == "__main__":
    # Ask for the input folder and quarter label, then split every report found there.
    source_path = input("What is the source path that holds on all the files?: ")
    quarter_label = input("Which quarter is this for? i.e. 'Q2': ")
    process_files(source_path, quarter_label)
    print("Script is complete!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment