Created
August 26, 2021 21:31
-
-
Save anmolgarg/c12ad5c6bad1797f97ecee4d16e4614a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# standard | |
from pathlib import Path | |
import re | |
# third party | |
import pandas as pd | |
import PyPDF2 | |
# Root folder of the extracted factory service manual PDFs. A plain string is
# enough here: the original f-string had no placeholders to interpolate.
FSM_PATH = "/Users/anmolgarg/Documents/fsm/2000_FSM/Repair Manual"
# Merged PDFs are written back into the same tree they were read from.
SAVE_PATH = FSM_PATH
def build_pdfs_data():
    """Index every source PDF found under ``FSM_PATH``.

    Paths containing ``vti_cnf`` or ``__`` are skipped; ``__`` is the prefix
    this script gives to the merged files it writes, so reruns do not pick up
    their own output.

    Returns:
        pandas.DataFrame: one row per PDF with the columns ``filepath``
        (``pathlib.Path``), ``section_title`` (first path component below
        ``FSM_PATH``), ``section_page`` (label returned by
        ``get_page_number``), ``section`` (text before ``±`` in that label)
        and ``page`` (integer after ``±``). Rows whose path contains
        ``A43D`` are dropped.
    """
    root = Path(FSM_PATH)
    excluded_markers = ("vti_cnf", "__")
    paths = sorted(
        path
        for path in root.rglob("*.pdf")
        if not any(marker in path.as_posix() for marker in excluded_markers)
    )
    df = pd.DataFrame(paths, columns=["filepath"])
    # Take the first component relative to the root instead of a fixed index
    # into the absolute path, so the result does not depend on how deep
    # FSM_PATH sits in the filesystem.
    df["section_title"] = df.filepath.apply(
        lambda path: path.relative_to(root).parts[0]
    )
    # Use regex to find each pdf's section and page number
    df["section_page"] = df.filepath.apply(get_page_number)
    df["section"] = df.section_page.apply(lambda label: label.split("±")[0])
    df["page"] = df.section_page.apply(lambda label: label.split("±")[1]).astype(int)
    # Remove unneeded transmission
    wanted = ["A43D" not in path.as_posix() for path in df.filepath]
    # Copy so that columns added to the result later are not written to a
    # slice of the unfiltered frame.
    return df.loc[wanted].copy()
def get_page_number(pdf):
    """Return the ``<section>±<page>`` label found in a PDF's first page.

    The text of page 0 is extracted, everything from the first occurrence of
    ``"2000 TOYOTA"`` onward is discarded, and the last match of two capital
    letters, ``±`` and one to three digits in the remaining header is
    returned (for example ``"IN±12"``).
    NOTE(review): ``±`` is presumably how the dash of labels such as "IN-12"
    comes out of PyPDF2's text extraction for these files; confirm on a sample.

    Args:
        pdf: ``pathlib.Path`` of the PDF to inspect.

    Returns:
        str: the matched label.

    Raises:
        ValueError: if the header contains no such label.
    """
    # Open the file ourselves so the handle is closed as soon as the text of
    # the first page has been extracted.
    with open(pdf.as_posix(), "rb") as stream:
        first_page = PyPDF2.PdfFileReader(stream).getPage(0)
        text = first_page.extractText()
    header = text.split("2000 TOYOTA")[0]
    # Raw string: "\d" in a normal string literal is an invalid escape.
    labels = re.findall(r"[A-Z]{2}±\d{1,3}", header)
    if not labels:
        raise ValueError(f"no section/page label found in the header of {pdf}")
    return labels[-1]
def reorder_by_section(df):
    """Sort the PDF index into table-of-contents order.

    Args:
        df: DataFrame with at least a ``section`` column (two-letter section
            code) and a ``page`` column.

    Returns:
        pandas.DataFrame: a new frame with an added integer ``section_index``
        column (position of the section in the table of contents), sorted by
        ``section_index`` then ``page`` and re-indexed from 0. The input
        frame is left unmodified.

    Raises:
        ValueError: if the set of sections in ``df`` is not exactly the set
            of sections listed in the table of contents.
    """
    # using ToC here https://www.factoryrepairmanuals.com/2000-toyota-tacoma-factory-service-manual-set-original-shop-repair/
    section_order = [
        "IN",
        "MA",
        "PP",
        "SS",
        "DI",
        "EM",
        "EC",
        "SF",
        "CO",
        "LU",
        "IG",
        "ST",
        "CH",
        "AT",
        "TR",
        "PR",
        "SA",
        "BR",
        "SR",
        "RS",
        "BE",
        "BO",
        "AC",
    ]
    expected = set(section_order)
    found = set(df.section.unique())
    # Explicit check instead of assert: asserts are stripped under "python -O"
    # and would not say which sections are wrong.
    if found != expected:
        missing = sorted(expected - found)
        unknown = sorted(str(section) for section in found - expected)
        raise ValueError(
            f"sections do not match the table of contents "
            f"(missing: {missing}, unknown: {unknown})"
        )
    rank = {section: ix for ix, section in enumerate(section_order)}
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    ranked = df.assign(section_index=df.section.map(rank))
    # order by section index and page number
    return ranked.sort_values(["section_index", "page"]).reset_index(drop=True)
def _merge_pdfs(pdf_paths, output_path):
    """Concatenate the PDFs in ``pdf_paths``, in order, into ``output_path``."""
    from contextlib import ExitStack

    merger = PyPDF2.PdfFileMerger()
    with ExitStack() as stack:
        stack.callback(merger.close)
        # Keep every input open until the output has been written (the merger
        # may read from the handles lazily); the stack then closes them all,
        # including when an error occurs part-way through.
        for pdf_path in pdf_paths:
            merger.append(stack.enter_context(open(pdf_path, "rb")))
        with open(output_path, "wb") as new_file:
            merger.write(new_file)


def write(df):
    """Write the merged manual and one merged PDF per section title.

    The full manual goes to ``{SAVE_PATH}/__repair_manual.pdf``. Each
    distinct ``section_title`` is additionally written to
    ``{SAVE_PATH}/{section}/__{section, lower-cased, spaces as "_"}.pdf``.

    Args:
        df: DataFrame with ``filepath`` and ``section_title`` columns, with
            rows already in the order the pages should appear.
    """
    # write full repair manual
    print("writing full repair manual")
    _merge_pdfs(df.filepath, f"{SAVE_PATH}/__repair_manual.pdf")
    # write per system repair manual
    print("writing per system repair manual")
    for section in df.section_title.unique():
        df_section = df.loc[df.section_title == section]
        section_save_path = f"{SAVE_PATH}/{section}/__{section.lower().replace(' ', '_')}.pdf"
        _merge_pdfs(df_section.filepath, section_save_path)
def main():
    """Index the PDFs, sort them into manual order, and write the merged files."""
    pdf_index = build_pdfs_data()
    ordered_index = reorder_by_section(pdf_index)
    write(ordered_index)
# Run the pipeline only when this file is executed as a script. The previous
# guard compared __name__ against the return value of main(), which ran the
# whole pipeline on every import as a side effect of evaluating the condition.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment