anmolgarg/build_fsm.py

## build_fsm.py
# standard
from pathlib import Path
import re

# third party
import pandas as pd
import PyPDF2

# Directory that is searched recursively for the manual's PDF files.
FSM_PATH = "/Users/anmolgarg/Documents/fsm/2000_FSM/Repair Manual"
# Merged PDFs are written below the same directory they were read from.
SAVE_PATH = FSM_PATH


def build_pdfs_data():
    """Index every individual manual PDF found under ``FSM_PATH``.

    Paths containing ``vti_cnf``, ``__`` (already merged files) or ``A43D``
    (the unneeded transmission) are dropped before any PDF is opened, so
    excluded files are never parsed.

    Returns:
        A DataFrame with one row per PDF, in sorted path order, with columns
        ``filepath`` (``Path``), ``section_title`` (first path component
        below ``FSM_PATH``), ``section_page`` (label returned by
        ``get_page_number``), ``section`` (text before ``±``) and ``page``
        (integer after ``±``).
    """
    root = Path(FSM_PATH)
    excluded_markers = ("vti_cnf", "__", "A43D")
    paths = sorted(
        path
        for path in root.rglob("*.pdf")
        if not any(marker in path.as_posix() for marker in excluded_markers)
    )

    df = pd.DataFrame(paths, columns=["filepath"])
    # Take the section folder relative to the root rather than a fixed
    # split("/") index, so moving FSM_PATH does not break the lookup.
    df["section_title"] = df.filepath.apply(lambda path: path.relative_to(root).parts[0])
    # Use regex to find each pdf's section and page number
    df["section_page"] = df.filepath.apply(get_page_number)
    label_parts = df.section_page.apply(lambda label: label.split("±"))
    df["section"] = label_parts.apply(lambda parts: parts[0])
    df["page"] = label_parts.apply(lambda parts: parts[1]).astype(int)

    return df


def get_page_number(pdf):
    """Return the section/page label printed in a PDF's first-page header.

    The header is the text extracted from page 0 that precedes the first
    occurrence of ``"2000 TOYOTA"``. The label is the last match in that
    header of two capital letters, ``±`` and one to three digits.

    Args:
        pdf: Path of the PDF file (``Path`` or ``str``).

    Returns:
        The matched label as a string, section code and page number joined
        by ``±``.

    Raises:
        IndexError: If the header contains no such label.
    """
    pdf_path = Path(pdf)
    first_page = PyPDF2.PdfFileReader(pdf_path.as_posix()).getPage(0)
    header = first_page.extractText().split("2000 TOYOTA")[0]
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    labels = re.findall(r"[A-Z]{2}±\d{1,3}", header)
    if not labels:
        # Same exception type as the bare [-1] lookup, but names the file.
        raise IndexError(f"no section±page label found in header of {pdf_path}")
    return labels[-1]


def reorder_by_section(df):
    """Sort the PDF index by manual section order, then by page number.

    Args:
        df: DataFrame with a ``section`` column of two-letter section codes
            and a ``page`` column. It is not modified.

    Returns:
        A new DataFrame with an added ``section_index`` column (position of
        the section in the table of contents), sorted by ``section_index``
        and ``page``, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``df`` contains a section code that is not in the
            table of contents, or if any listed section is absent.
    """
    # using ToC here https://www.factoryrepairmanuals.com/2000-toyota-tacoma-factory-service-manual-set-original-shop-repair/
    section_order = [
        "IN",
        "MA",
        "PP",
        "SS",
        "DI",
        "EM",
        "EC",
        "SF",
        "CO",
        "LU",
        "IG",
        "ST",
        "CH",
        "AT",
        "TR",
        "PR",
        "SA",
        "BR",
        "SR",
        "RS",
        "BE",
        "BO",
        "AC",
    ]
    rank_by_section = {code: rank for rank, code in enumerate(section_order)}
    present = set(df.section.unique())

    # Explicit checks instead of assert, which is stripped under ``python -O``.
    unknown = sorted(str(code) for code in present - rank_by_section.keys())
    if unknown:
        raise ValueError(f"sections missing from the section order: {unknown}")
    missing = [code for code in section_order if code not in present]
    if missing:
        raise ValueError(f"expected sections not found in the data: {missing}")

    # assign() works on a copy, so the caller's frame is left untouched.
    ranked = df.assign(section_index=df.section.map(rank_by_section))

    # order by section index and page number
    return ranked.sort_values(["section_index", "page"]).reset_index(drop=True)


def _merge_pdfs(pdf_paths, destination):
    """Merge ``pdf_paths``, in order, into one PDF written to ``destination``."""
    from contextlib import ExitStack

    merger = PyPDF2.PdfFileMerger()
    try:
        # Inputs stay open until the merged file is written, then are all
        # closed, even if appending or writing fails part-way through.
        with ExitStack() as open_inputs:
            for pdf_path in pdf_paths:
                merger.append(open_inputs.enter_context(open(pdf_path, "rb")))
            with open(destination, "wb") as merged_file:
                merger.write(merged_file)
    finally:
        merger.close()


def write(df):
    """Write the merged manual and one merged PDF per section title.

    The full manual goes to ``{SAVE_PATH}/__repair_manual.pdf``. Each
    distinct ``section_title`` gets its own merged file inside that
    section's folder under ``SAVE_PATH``. Files are merged in row order.

    Args:
        df: DataFrame with ``filepath`` and ``section_title`` columns.

    Returns:
        None.
    """
    # write full repair manual
    print("writing full repair manual")
    _merge_pdfs(df.filepath, f"{SAVE_PATH}/__repair_manual.pdf")

    # write per system repair manual
    print("writing per system repair manual")
    for section in df.section_title.unique():
        df_section = df.loc[df.section_title == section]
        section_save_path = f"{SAVE_PATH}/{section}/__{section.lower().replace(' ', '_')}.pdf"
        _merge_pdfs(df_section.filepath, section_save_path)


def main():
    """Build the PDF index, put it in manual order and write the merged PDFs."""
    ordered_index = reorder_by_section(build_pdfs_data())
    write(ordered_index)


# Run the pipeline only when this file is executed as a script. Comparing
# __name__ against main() would call main() on every import and then never
# enter the guard, because main() returns None.
if __name__ == "__main__":
    main()
	# standard
	from pathlib import Path
	import re

	# third party
	import pandas as pd
	import PyPDF2

	# Directory that is searched recursively for the manual's PDF files.
	FSM_PATH = "/Users/anmolgarg/Documents/fsm/2000_FSM/Repair Manual"
	# Merged PDFs are written below the same directory they were read from.
	SAVE_PATH = FSM_PATH


	def build_pdfs_data():
	    """Index every individual manual PDF found under ``FSM_PATH``.

	    Paths containing ``vti_cnf``, ``__`` (already merged files) or ``A43D``
	    (the unneeded transmission) are dropped before any PDF is opened, so
	    excluded files are never parsed.

	    Returns:
	        A DataFrame with one row per PDF, in sorted path order, with columns
	        ``filepath`` (``Path``), ``section_title`` (first path component
	        below ``FSM_PATH``), ``section_page`` (label returned by
	        ``get_page_number``), ``section`` (text before ``±``) and ``page``
	        (integer after ``±``).
	    """
	    root = Path(FSM_PATH)
	    excluded_markers = ("vti_cnf", "__", "A43D")
	    paths = sorted(
	        path
	        for path in root.rglob("*.pdf")
	        if not any(marker in path.as_posix() for marker in excluded_markers)
	    )

	    df = pd.DataFrame(paths, columns=["filepath"])
	    # Take the section folder relative to the root rather than a fixed
	    # split("/") index, so moving FSM_PATH does not break the lookup.
	    df["section_title"] = df.filepath.apply(lambda path: path.relative_to(root).parts[0])
	    # Use regex to find each pdf's section and page number
	    df["section_page"] = df.filepath.apply(get_page_number)
	    label_parts = df.section_page.apply(lambda label: label.split("±"))
	    df["section"] = label_parts.apply(lambda parts: parts[0])
	    df["page"] = label_parts.apply(lambda parts: parts[1]).astype(int)

	    return df


	def get_page_number(pdf):
	    """Return the section/page label printed in a PDF's first-page header.

	    The header is the text extracted from page 0 that precedes the first
	    occurrence of ``"2000 TOYOTA"``. The label is the last match in that
	    header of two capital letters, ``±`` and one to three digits.

	    Args:
	        pdf: Path of the PDF file (``Path`` or ``str``).

	    Returns:
	        The matched label as a string, section code and page number joined
	        by ``±``.

	    Raises:
	        IndexError: If the header contains no such label.
	    """
	    pdf_path = Path(pdf)
	    first_page = PyPDF2.PdfFileReader(pdf_path.as_posix()).getPage(0)
	    header = first_page.extractText().split("2000 TOYOTA")[0]
	    # Raw string: "\d" in a plain literal is an invalid escape sequence.
	    labels = re.findall(r"[A-Z]{2}±\d{1,3}", header)
	    if not labels:
	        # Same exception type as the bare [-1] lookup, but names the file.
	        raise IndexError(f"no section±page label found in header of {pdf_path}")
	    return labels[-1]


	def reorder_by_section(df):
	    """Sort the PDF index by manual section order, then by page number.

	    Args:
	        df: DataFrame with a ``section`` column of two-letter section codes
	            and a ``page`` column. It is not modified.

	    Returns:
	        A new DataFrame with an added ``section_index`` column (position of
	        the section in the table of contents), sorted by ``section_index``
	        and ``page``, with a fresh 0..n-1 index.

	    Raises:
	        ValueError: If ``df`` contains a section code that is not in the
	            table of contents, or if any listed section is absent.
	    """
	    # using ToC here https://www.factoryrepairmanuals.com/2000-toyota-tacoma-factory-service-manual-set-original-shop-repair/
	    section_order = [
	        "IN",
	        "MA",
	        "PP",
	        "SS",
	        "DI",
	        "EM",
	        "EC",
	        "SF",
	        "CO",
	        "LU",
	        "IG",
	        "ST",
	        "CH",
	        "AT",
	        "TR",
	        "PR",
	        "SA",
	        "BR",
	        "SR",
	        "RS",
	        "BE",
	        "BO",
	        "AC",
	    ]
	    rank_by_section = {code: rank for rank, code in enumerate(section_order)}
	    present = set(df.section.unique())

	    # Explicit checks instead of assert, which is stripped under ``python -O``.
	    unknown = sorted(str(code) for code in present - rank_by_section.keys())
	    if unknown:
	        raise ValueError(f"sections missing from the section order: {unknown}")
	    missing = [code for code in section_order if code not in present]
	    if missing:
	        raise ValueError(f"expected sections not found in the data: {missing}")

	    # assign() works on a copy, so the caller's frame is left untouched.
	    ranked = df.assign(section_index=df.section.map(rank_by_section))

	    # order by section index and page number
	    return ranked.sort_values(["section_index", "page"]).reset_index(drop=True)


	def _merge_pdfs(pdf_paths, destination):
	    """Merge ``pdf_paths``, in order, into one PDF written to ``destination``."""
	    from contextlib import ExitStack

	    merger = PyPDF2.PdfFileMerger()
	    try:
	        # Inputs stay open until the merged file is written, then are all
	        # closed, even if appending or writing fails part-way through.
	        with ExitStack() as open_inputs:
	            for pdf_path in pdf_paths:
	                merger.append(open_inputs.enter_context(open(pdf_path, "rb")))
	            with open(destination, "wb") as merged_file:
	                merger.write(merged_file)
	    finally:
	        merger.close()


	def write(df):
	    """Write the merged manual and one merged PDF per section title.

	    The full manual goes to ``{SAVE_PATH}/__repair_manual.pdf``. Each
	    distinct ``section_title`` gets its own merged file inside that
	    section's folder under ``SAVE_PATH``. Files are merged in row order.

	    Args:
	        df: DataFrame with ``filepath`` and ``section_title`` columns.

	    Returns:
	        None.
	    """
	    # write full repair manual
	    print("writing full repair manual")
	    _merge_pdfs(df.filepath, f"{SAVE_PATH}/__repair_manual.pdf")

	    # write per system repair manual
	    print("writing per system repair manual")
	    for section in df.section_title.unique():
	        df_section = df.loc[df.section_title == section]
	        section_save_path = f"{SAVE_PATH}/{section}/__{section.lower().replace(' ', '_')}.pdf"
	        _merge_pdfs(df_section.filepath, section_save_path)


	def main():
	    """Build the PDF index, put it in manual order and write the merged PDFs."""
	    ordered_index = reorder_by_section(build_pdfs_data())
	    write(ordered_index)


	# Run the pipeline only when this file is executed as a script. Comparing
	# __name__ against main() would call main() on every import and then never
	# enter the guard, because main() returns None.
	if __name__ == "__main__":
	    main()