Skip to content

Instantly share code, notes, and snippets.

@anmolgarg
Created August 26, 2021 21:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anmolgarg/c12ad5c6bad1797f97ecee4d16e4614a to your computer and use it in GitHub Desktop.
Save anmolgarg/c12ad5c6bad1797f97ecee4d16e4614a to your computer and use it in GitHub Desktop.
# standard
import re
from contextlib import ExitStack
from pathlib import Path

# third party
import pandas as pd
import PyPDF2
# Folder that is searched recursively for the manual's individual PDF files.
FSM_PATH = "/Users/anmolgarg/Documents/fsm/2000_FSM/Repair Manual"
# Merged PDFs are written back into the same folder tree they were read from.
SAVE_PATH = FSM_PATH
def build_pdfs_data():
    """Index every source PDF found under ``FSM_PATH``.

    Returns:
        A DataFrame with one row per PDF and the columns ``filepath``
        (``Path``), ``section_title`` (name of the first path component
        below ``FSM_PATH``), ``section_page`` (marker returned by
        ``get_page_number``), ``section`` (text before the "±") and
        ``page`` (integer after the "±").
    """
    root = Path(FSM_PATH)
    # Remove vti_cnf paths as well as already merged files
    # (merged outputs are written with a "__" prefix, see write()).
    paths = sorted(
        path
        for path in root.rglob("*.pdf")
        if "vti_cnf" not in path.as_posix() and "__" not in path.as_posix()
    )
    df = pd.DataFrame(paths, columns=["filepath"])
    # The section title is the first component below the root; computing it
    # relative to FSM_PATH keeps this correct if the root is moved to a
    # location with a different directory depth.
    df["section_title"] = df.filepath.apply(
        lambda path: path.relative_to(root).parts[0]
    )
    # Use regex to find each pdf's section and page number
    df["section_page"] = df.filepath.apply(get_page_number)
    df["section"] = df.section_page.apply(lambda marker: marker.split("±")[0])
    df["page"] = df.section_page.apply(
        lambda marker: marker.split("±")[1]
    ).astype(int)
    # Remove unneeded transmission
    keep = ["A43D" not in path.as_posix() for path in df.filepath]
    # Return an independent copy so callers can add columns without pandas
    # raising SettingWithCopyWarning on a filtered slice.
    return df.loc[keep].copy()
def get_page_number(pdf):
    """Return the section/page marker printed on the first page of a PDF.

    The text of page 0 is extracted, everything from the first occurrence of
    "2000 TOYOTA" onwards is discarded, and the last token of the form
    two uppercase letters + "±" + one to three digits is returned.

    Args:
        pdf: ``pathlib.Path`` of the PDF file to inspect.

    Returns:
        The matched marker string, e.g. a value shaped like ``"XX±12"``.

    Raises:
        IndexError: if no marker is found in the header text.
    """
    page0 = PyPDF2.PdfFileReader(pdf.as_posix()).getPage(0)
    header = page0.extractText().split("2000 TOYOTA")[0]
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    markers = re.findall(r"[A-Z]{2}±\d{1,3}", header)
    if not markers:
        # Same exception type as indexing an empty list, but says which file.
        raise IndexError(f"no section±page marker found on first page of {pdf}")
    return markers[-1]
def reorder_by_section(df):
    """Return the rows of ``df`` sorted by manual section order, then page.

    Args:
        df: DataFrame with a ``section`` column (two-letter section code)
            and a ``page`` column.

    Returns:
        A new DataFrame with an added integer ``section_index`` column
        (position of the section in the table of contents), sorted by
        ``section_index`` and ``page``, with a fresh 0..n-1 index. The input
        DataFrame is left unmodified.

    Raises:
        ValueError: if the set of sections in ``df`` is not exactly the set
            of known sections (some are missing or unknown).
    """
    # using ToC here https://www.factoryrepairmanuals.com/2000-toyota-tacoma-factory-service-manual-set-original-shop-repair/
    section_order = [
        "IN",
        "MA",
        "PP",
        "SS",
        "DI",
        "EM",
        "EC",
        "SF",
        "CO",
        "LU",
        "IG",
        "ST",
        "CH",
        "AT",
        "TR",
        "PR",
        "SA",
        "BR",
        "SR",
        "RS",
        "BE",
        "BO",
        "AC",
    ]
    # Explicit validation instead of bare asserts: it survives `python -O`
    # and reports exactly which sections are the problem.
    present = set(df.section.unique())
    missing = [section for section in section_order if section not in present]
    unknown = sorted(present.difference(section_order), key=str)
    if missing or unknown:
        raise ValueError(
            f"sections do not match the table of contents: "
            f"missing={missing}, unknown={unknown}"
        )
    rank = {section: ix for ix, section in enumerate(section_order)}
    # Work on a copy so the caller's DataFrame does not gain a column.
    ordered = df.copy()
    ordered["section_index"] = ordered.section.map(rank)
    # order by section index and page number
    return ordered.sort_values(["section_index", "page"]).reset_index(drop=True)
def _merge_pdfs(pdf_paths, out_path):
    """Concatenate the PDFs at ``pdf_paths`` (in order) into ``out_path``."""
    merger = PyPDF2.PdfFileMerger()
    with ExitStack() as stack:
        # Registered first so it runs last, after the input files are closed.
        stack.callback(merger.close)
        for pdf in pdf_paths:
            # Inputs must stay open until the merged file has been written;
            # the stack closes them all afterwards, even on error.
            merger.append(stack.enter_context(open(pdf, "rb")))
        with open(out_path, "wb") as new_file:
            merger.write(new_file)


def write(df):
    """Write the merged manual and one merged PDF per section title.

    Args:
        df: DataFrame with ``filepath`` and ``section_title`` columns, already
            in the order in which the pages should appear.

    Side effects:
        Writes ``__repair_manual.pdf`` into ``SAVE_PATH`` and, for every
        distinct ``section_title``, ``__<title lower-cased, spaces as
        underscores>.pdf`` into ``SAVE_PATH/<title>``.
    """
    # write full repair manual
    print("writing full repair manual")
    _merge_pdfs(df.filepath, f"{SAVE_PATH}/__repair_manual.pdf")
    # write per system repair manual
    print("writing per system repair manual")
    for section in df.section_title.unique():
        df_section = df.loc[df.section_title == section]
        section_save_path = f"{SAVE_PATH}/{section}/__{section.lower().replace(' ', '_')}.pdf"
        _merge_pdfs(df_section.filepath, section_save_path)
def main():
    """Index the source PDFs, put them in manual order, and write the merged files."""
    ordered = reorder_by_section(build_pdfs_data())
    write(ordered)
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment