# duylebkHCM/book_splitter.py

## book_splitter.py
import re
import fitz
from fitz import Page
import argparse
import pandas as pd
from pathlib import Path
from collections import defaultdict


# TOC entries whose title contains one of these substrings are skipped when
# a book is split.
EXCLUDE_KEYWORD = ['Contents', 'Organization', 'Preface', 'Foreword', 'Author Index']

# Prefixes that introduce the keyword block on the first page of a paper.
PAPER_KEYWORD_PTTN = ['Keywords:']

# Matches any character that is not an ASCII letter or digit.
CLEAN_PTTN = re.compile(r'[^0-9a-zA-Z]')


def extract_keyword(first_page: "Page", keyword_markers=None):
    """Extract the keyword list printed on the first page of a paper.

    The page's text blocks are scanned in the order ``get_text("blocks")``
    returns them. The first block whose text starts with one of the markers
    is treated as the keyword block: the leading marker is removed, the rest
    is split on the middle dot ('·') and on line breaks, and every non-empty
    piece is kept as one keyword. Later blocks are ignored.

    Args:
        first_page: Object exposing ``get_text("blocks")``; index 4 of each
            returned block is read as that block's text.
        keyword_markers: Optional iterable of prefixes that introduce the
            keyword block. Defaults to the module-level PAPER_KEYWORD_PTTN.

    Returns:
        The keywords joined with ',', or the string 'Not found' when no block
        starts with a marker or the matching block holds no keyword text.
    """
    markers = PAPER_KEYWORD_PTTN if keyword_markers is None else keyword_markers

    keywords = []
    for blk in first_page.get_text("blocks"):
        text = blk[4]
        marker = next((m for m in markers if text.startswith(m)), None)
        if marker is None:
            continue

        # Strip only the leading marker; everything after it is the list.
        for chunk in text[len(marker):].split('·'):
            # A chunk may span several lines; each line is its own entry.
            # Empty pieces (bare marker, trailing separator, blank line) are
            # skipped so they never become empty keywords.
            keywords += [piece.strip() for piece in chunk.split('\n') if piece.strip()]
        # Only the first matching block is used.
        break

    if keywords:
        return ','.join(keywords)
    return 'Not found'


def _group_papers_by_chapter(toc_entries):
    """Map each chapter's TOC index to the index spans of the papers under it.

    ``toc_entries`` is a list of ``[level, title, page]`` items. A level-1
    entry directly followed by another level-1 entry (or by the end of the
    list) is treated as a chapter heading. A level-1 entry followed by
    entries of other levels is treated as a paper whose span runs up to the
    entry just before the next level-1 entry. Papers that appear before the
    first chapter heading are not assigned to any chapter.

    Returns:
        Mapping ``chapter_index -> [(first_index, last_index), ...]`` with
        chapters and papers in list order; chapters without papers map to
        an empty list.
    """
    first_level = [idx for idx, entry in enumerate(toc_entries) if entry[0] == 1]
    # Sentinel so the last level-1 entry also has a following boundary.
    boundaries = first_level + [len(toc_entries)]

    chapter_info = defaultdict(list)
    current_chapter = None
    for idx, next_idx in zip(boundaries[:-1], boundaries[1:]):
        if next_idx - idx == 1:
            current_chapter = idx
            chapter_info[idx] = []
        elif current_chapter is not None:
            chapter_info[current_chapter].append((idx, next_idx - 1))
    return chapter_info


def split_pdf_book(book_dirs, output_dir=None, pattern='*.pdf'):
    """Split PDF books into one PDF per paper, grouped by chapter.

    For every book the table of contents is read, entries whose title
    contains an EXCLUDE_KEYWORD substring are dropped, and the remaining
    entries are sorted by page and grouped into chapters and papers. Each
    paper is saved as
    ``<output_dir>/<book stem>/<chapter title>/<cleaned paper title>.pdf``
    and described by one row of the tab-separated ``<output_dir>/report.csv``.

    Args:
        book_dirs: Path to a single PDF file, or to a directory that is
            searched recursively with ``pattern``.
        output_dir: Root directory for the extracted papers and the report.
        pattern: Glob pattern used when ``book_dirs`` is a directory.

    Raises:
        TypeError: If ``output_dir`` is None.
    """
    if output_dir is None:
        # Path(None) would fail further down with a far less helpful message.
        raise TypeError('output_dir must be a path, not None')
    output_root = Path(output_dir)

    source = Path(book_dirs)
    if source.is_file():
        book_paths = [source]
    else:
        book_paths = list(source.rglob(pattern))

    statistic_info = {
        'book_name': [],
        'chapter_title': [],
        'paper_title': [],
        'original_title': [],
        'keyword': [],
        'save_path': [],
    }

    for book_path in book_paths:
        paper_dir = output_root.joinpath(book_path.stem)
        paper_dir.mkdir(parents=True, exist_ok=True)

        doc = fitz.Document(book_path.as_posix())
        try:
            toc = doc.get_toc(simple=False)
            contents = [[int(t[0]), t[1].strip(), int(t[2])] for t in toc]

            remain_content = [
                content for content in contents
                if not any(key in content[1] for key in EXCLUDE_KEYWORD)
            ]
            remain_content.sort(key=lambda content: content[2])

            chapter_info = _group_papers_by_chapter(remain_content)

            for chapter_idx, paper_spans in chapter_info.items():
                chapter_title = remain_content[chapter_idx][1]
                # The chapter title is used verbatim as the directory name.
                chapter_dir = paper_dir.joinpath(chapter_title)
                chapter_dir.mkdir(parents=True, exist_ok=True)

                for first_idx, last_idx in paper_spans:
                    # TOC page numbers are 1-based, document indices 0-based.
                    start_page = remain_content[first_idx][2] - 1
                    # NOTE(review): the last copied page is the one on which
                    # the paper's final TOC entry starts; pages after it are
                    # not copied. Confirm this is the intended end boundary.
                    end_page = remain_content[last_idx][2] - 1
                    extracted_keyword = extract_keyword(doc[start_page])

                    paper_name = remain_content[first_idx][1]
                    # Replace every non-alphanumeric run with a single '_' so
                    # the words of the title stay separated in the file name.
                    clean_paper_name = '_'.join(CLEAN_PTTN.sub(' ', paper_name).split())
                    save_path = chapter_dir.joinpath(clean_paper_name).with_suffix('.pdf')

                    paper_inst = fitz.open()
                    try:
                        paper_inst.insert_pdf(doc, from_page=start_page, to_page=end_page)
                        paper_inst.save(save_path)
                    finally:
                        paper_inst.close()

                    statistic_info['original_title'].append(paper_name)
                    statistic_info['paper_title'].append(clean_paper_name)
                    statistic_info['chapter_title'].append(chapter_title)
                    statistic_info['book_name'].append(book_path.stem)
                    statistic_info['keyword'].append(extracted_keyword)
                    statistic_info['save_path'].append(save_path.as_posix())
        finally:
            # Release the source document even if a paper fails to export.
            doc.close()

        # Rewritten after each book with everything collected so far.
        statistic_info_df = pd.DataFrame.from_dict(statistic_info)
        statistic_info_df.to_csv(output_root.joinpath('report.csv'), index=False, sep='\t')

if __name__ == '__main__':
    # Command-line entry point. Both paths are required: without them
    # split_pdf_book would fail inside Path(None) with an opaque TypeError.
    parser = argparse.ArgumentParser(
        description='Split PDF books into one PDF per paper using their table of contents.'
    )
    parser.add_argument('--input_dir', required=True,
                        help='a PDF file, or a directory searched recursively for *.pdf')
    parser.add_argument('--output_dir', required=True,
                        help='directory that receives the extracted papers and report.csv')
    args = parser.parse_args()
    split_pdf_book(book_dirs=args.input_dir, output_dir=args.output_dir)
	import re
	import fitz
	from fitz import Page
	import argparse
	import pandas as pd
	from pathlib import Path
	from collections import defaultdict


	# TOC entries whose title contains one of these substrings are skipped when
	# a book is split.
	EXCLUDE_KEYWORD = ['Contents', 'Organization', 'Preface', 'Foreword', 'Author Index']

	# Prefixes that introduce the keyword block on the first page of a paper.
	PAPER_KEYWORD_PTTN = ['Keywords:']

	# Matches any character that is not an ASCII letter or digit.
	CLEAN_PTTN = re.compile(r'[^0-9a-zA-Z]')


	def extract_keyword(first_page: "Page", keyword_markers=None):
	    """Extract the keyword list printed on the first page of a paper.

	    The page's text blocks are scanned in the order ``get_text("blocks")``
	    returns them. The first block whose text starts with one of the markers
	    is treated as the keyword block: the leading marker is removed, the rest
	    is split on the middle dot ('·') and on line breaks, and every non-empty
	    piece is kept as one keyword. Later blocks are ignored.

	    Args:
	        first_page: Object exposing ``get_text("blocks")``; index 4 of each
	            returned block is read as that block's text.
	        keyword_markers: Optional iterable of prefixes that introduce the
	            keyword block. Defaults to the module-level PAPER_KEYWORD_PTTN.

	    Returns:
	        The keywords joined with ',', or the string 'Not found' when no block
	        starts with a marker or the matching block holds no keyword text.
	    """
	    markers = PAPER_KEYWORD_PTTN if keyword_markers is None else keyword_markers

	    keywords = []
	    for blk in first_page.get_text("blocks"):
	        text = blk[4]
	        marker = next((m for m in markers if text.startswith(m)), None)
	        if marker is None:
	            continue

	        # Strip only the leading marker; everything after it is the list.
	        for chunk in text[len(marker):].split('·'):
	            # A chunk may span several lines; each line is its own entry.
	            # Empty pieces (bare marker, trailing separator, blank line) are
	            # skipped so they never become empty keywords.
	            keywords += [piece.strip() for piece in chunk.split('\n') if piece.strip()]
	        # Only the first matching block is used.
	        break

	    if keywords:
	        return ','.join(keywords)
	    return 'Not found'


	def _group_papers_by_chapter(toc_entries):
	    """Map each chapter's TOC index to the index spans of the papers under it.

	    ``toc_entries`` is a list of ``[level, title, page]`` items. A level-1
	    entry directly followed by another level-1 entry (or by the end of the
	    list) is treated as a chapter heading. A level-1 entry followed by
	    entries of other levels is treated as a paper whose span runs up to the
	    entry just before the next level-1 entry. Papers that appear before the
	    first chapter heading are not assigned to any chapter.

	    Returns:
	        Mapping ``chapter_index -> [(first_index, last_index), ...]`` with
	        chapters and papers in list order; chapters without papers map to
	        an empty list.
	    """
	    first_level = [idx for idx, entry in enumerate(toc_entries) if entry[0] == 1]
	    # Sentinel so the last level-1 entry also has a following boundary.
	    boundaries = first_level + [len(toc_entries)]

	    chapter_info = defaultdict(list)
	    current_chapter = None
	    for idx, next_idx in zip(boundaries[:-1], boundaries[1:]):
	        if next_idx - idx == 1:
	            current_chapter = idx
	            chapter_info[idx] = []
	        elif current_chapter is not None:
	            chapter_info[current_chapter].append((idx, next_idx - 1))
	    return chapter_info


	def split_pdf_book(book_dirs, output_dir=None, pattern='*.pdf'):
	    """Split PDF books into one PDF per paper, grouped by chapter.

	    For every book the table of contents is read, entries whose title
	    contains an EXCLUDE_KEYWORD substring are dropped, and the remaining
	    entries are sorted by page and grouped into chapters and papers. Each
	    paper is saved as
	    ``<output_dir>/<book stem>/<chapter title>/<cleaned paper title>.pdf``
	    and described by one row of the tab-separated ``<output_dir>/report.csv``.

	    Args:
	        book_dirs: Path to a single PDF file, or to a directory that is
	            searched recursively with ``pattern``.
	        output_dir: Root directory for the extracted papers and the report.
	        pattern: Glob pattern used when ``book_dirs`` is a directory.

	    Raises:
	        TypeError: If ``output_dir`` is None.
	    """
	    if output_dir is None:
	        # Path(None) would fail further down with a far less helpful message.
	        raise TypeError('output_dir must be a path, not None')
	    output_root = Path(output_dir)

	    source = Path(book_dirs)
	    if source.is_file():
	        book_paths = [source]
	    else:
	        book_paths = list(source.rglob(pattern))

	    statistic_info = {
	        'book_name': [],
	        'chapter_title': [],
	        'paper_title': [],
	        'original_title': [],
	        'keyword': [],
	        'save_path': [],
	    }

	    for book_path in book_paths:
	        paper_dir = output_root.joinpath(book_path.stem)
	        paper_dir.mkdir(parents=True, exist_ok=True)

	        doc = fitz.Document(book_path.as_posix())
	        try:
	            toc = doc.get_toc(simple=False)
	            contents = [[int(t[0]), t[1].strip(), int(t[2])] for t in toc]

	            remain_content = [
	                content for content in contents
	                if not any(key in content[1] for key in EXCLUDE_KEYWORD)
	            ]
	            remain_content.sort(key=lambda content: content[2])

	            chapter_info = _group_papers_by_chapter(remain_content)

	            for chapter_idx, paper_spans in chapter_info.items():
	                chapter_title = remain_content[chapter_idx][1]
	                # The chapter title is used verbatim as the directory name.
	                chapter_dir = paper_dir.joinpath(chapter_title)
	                chapter_dir.mkdir(parents=True, exist_ok=True)

	                for first_idx, last_idx in paper_spans:
	                    # TOC page numbers are 1-based, document indices 0-based.
	                    start_page = remain_content[first_idx][2] - 1
	                    # NOTE(review): the last copied page is the one on which
	                    # the paper's final TOC entry starts; pages after it are
	                    # not copied. Confirm this is the intended end boundary.
	                    end_page = remain_content[last_idx][2] - 1
	                    extracted_keyword = extract_keyword(doc[start_page])

	                    paper_name = remain_content[first_idx][1]
	                    # Replace every non-alphanumeric run with a single '_' so
	                    # the words of the title stay separated in the file name.
	                    clean_paper_name = '_'.join(CLEAN_PTTN.sub(' ', paper_name).split())
	                    save_path = chapter_dir.joinpath(clean_paper_name).with_suffix('.pdf')

	                    paper_inst = fitz.open()
	                    try:
	                        paper_inst.insert_pdf(doc, from_page=start_page, to_page=end_page)
	                        paper_inst.save(save_path)
	                    finally:
	                        paper_inst.close()

	                    statistic_info['original_title'].append(paper_name)
	                    statistic_info['paper_title'].append(clean_paper_name)
	                    statistic_info['chapter_title'].append(chapter_title)
	                    statistic_info['book_name'].append(book_path.stem)
	                    statistic_info['keyword'].append(extracted_keyword)
	                    statistic_info['save_path'].append(save_path.as_posix())
	        finally:
	            # Release the source document even if a paper fails to export.
	            doc.close()

	        # Rewritten after each book with everything collected so far.
	        statistic_info_df = pd.DataFrame.from_dict(statistic_info)
	        statistic_info_df.to_csv(output_root.joinpath('report.csv'), index=False, sep='\t')

	if __name__ == '__main__':
	    # Command-line entry point. Both paths are required: without them
	    # split_pdf_book would fail inside Path(None) with an opaque TypeError.
	    parser = argparse.ArgumentParser(
	        description='Split PDF books into one PDF per paper using their table of contents.'
	    )
	    parser.add_argument('--input_dir', required=True,
	                        help='a PDF file, or a directory searched recursively for *.pdf')
	    parser.add_argument('--output_dir', required=True,
	                        help='directory that receives the extracted papers and report.csv')
	    args = parser.parse_args()
	    split_pdf_book(book_dirs=args.input_dir, output_dir=args.output_dir)