Skip to content

Instantly share code, notes, and snippets.

@duylebkHCM
Last active December 8, 2023 16:27
Show Gist options
  • Save duylebkHCM/9a974a05e520b6a3c89861af24d690a0 to your computer and use it in GitHub Desktop.
Save duylebkHCM/9a974a05e520b6a3c89861af24d690a0 to your computer and use it in GitHub Desktop.
Automatically split ICDAR proceedings into separate papers
import re
import fitz
from fitz import Page
import argparse
import pandas as pd
from pathlib import Path
from collections import defaultdict
# Table-of-contents entries whose title contains any of these substrings are
# dropped before the book is split (front/back matter, not papers).
EXCLUDE_KEYWORD = ['Contents', 'Organization', 'Preface', 'Foreword', 'Author Index']

# Prefixes that identify the keyword paragraph on a paper's first page.
PAPER_KEYWORD_PTTN = ['Keywords:']

# Matches every character that is not an ASCII letter or digit; used to turn a
# paper title into a file-system friendly name.
CLEAN_PTTN = re.compile(r'[^0-9a-zA-Z]')
def extract_keyword(first_page: Page):
    """Return the keywords listed on a paper's first page as one string.

    The page's text blocks are scanned in order; the first block whose text
    starts with one of the prefixes in ``PAPER_KEYWORD_PTTN`` is taken as the
    keyword paragraph.  The prefix is removed, the remainder is split on the
    middle dot (``·``) and on line breaks, and every piece is stripped.

    Args:
        first_page: Page to inspect; ``get_text("blocks")`` must yield
            sequences whose item 4 is the block's text.

    Returns:
        The pieces joined with ``','``, or ``'Not found'`` when no block
        starts with a keyword prefix.
    """
    for block in first_page.get_text("blocks"):
        text = block[4]
        # First configured prefix this block starts with, if any.
        prefix = next(
            (candidate for candidate in PAPER_KEYWORD_PTTN if text.startswith(candidate)),
            None,
        )
        if prefix is None:
            continue

        # Split on the separator first, then on line breaks inside each part.
        keywords = [
            piece.strip()
            for part in text.replace(prefix, '').split('·')
            for piece in part.strip().split('\n')
        ]
        return ','.join(keywords) if keywords else 'Not found'

    return 'Not found'
def _group_papers_by_chapter(entries):
    """Group paper entries under the chapter heading that precedes them.

    ``entries`` is the filtered, page-sorted table of contents as
    ``[level, title, page]`` lists.  A level-1 entry that is directly followed
    by another level-1 entry (or by the end of the list) has no sub-entries
    and is treated as a chapter heading; every other level-1 entry is treated
    as a paper whose sub-entries run up to the next level-1 entry.

    Returns a dict that maps each chapter heading's index to a list of
    ``(first_idx, last_idx)`` index pairs into ``entries``, one per paper.
    Papers located before the first chapter heading are not assigned to any
    chapter and therefore do not appear in the result.
    """
    total = len(entries)
    top_level = [idx for idx, entry in enumerate(entries) if entry[0] == 1]
    top_level.append(total)  # sentinel closing the last level-1 entry
    # Position of every level-1 index (and the sentinel) among level-1 entries.
    rank = {idx: pos for pos, idx in enumerate(top_level)}

    chapter_idxs, paper_idxs = [], []
    for idx, next_idx in zip(top_level, top_level[1:]):
        if next_idx - idx == 1:
            chapter_idxs.append(idx)
        else:
            paper_idxs.append(idx)

    chapter_info = {idx: [] for idx in chapter_idxs}
    chapter_spans = list(zip(chapter_idxs, chapter_idxs[1:] + [total]))
    for idx, next_idx in zip(paper_idxs, paper_idxs[1:] + [total]):
        for chapter_idx, next_chapter_idx in chapter_spans:
            if chapter_idx < idx < next_chapter_idx:
                if rank[idx] == rank[next_chapter_idx] - 1:
                    # Last paper of the chapter: it ends right before the
                    # next chapter heading.
                    last_idx = next_chapter_idx - 1
                else:
                    last_idx = next_idx - 1
                chapter_info[chapter_idx].append((idx, last_idx))
                break  # chapter spans do not overlap
    return chapter_info


def _safe_file_stem(title, fallback):
    """Turn a paper title into an underscore-separated ASCII file stem."""
    # Clean word by word: CLEAN_PTTN also removes whitespace, so applying it
    # to the whole title would fuse all words together.
    words = (re.sub(CLEAN_PTTN, '', word) for word in title.split())
    return '_'.join(word for word in words if word) or fallback


def split_pdf_book(book_dirs, output_dir=None, pattern='*.pdf'):
    """Split proceedings PDFs into one PDF per paper and write a report.

    For every book, the table of contents is read, entries whose title
    contains an ``EXCLUDE_KEYWORD`` substring are dropped, and the remaining
    level-1 entries are classified as chapter headings or papers.  Each paper
    is saved as ``<output_dir>/<book stem>/<chapter title>/<paper name>.pdf``
    and a tab-separated ``report.csv`` summarising all papers is written to
    ``output_dir``.

    Args:
        book_dirs: Path to a single PDF, or to a directory that is searched
            recursively for files matching ``pattern``.
        output_dir: Directory that receives the split papers and the report.
            It is created if it does not exist.
        pattern: Glob pattern used when ``book_dirs`` is a directory.

    Raises:
        TypeError: If ``output_dir`` is ``None``.
    """
    if output_dir is None:
        # Fail before any work is done instead of deep inside Path(None).
        raise TypeError('split_pdf_book() requires an output_dir, got None')
    output_root = Path(output_dir)
    # The report is written here even when no book is found.
    output_root.mkdir(parents=True, exist_ok=True)

    source = Path(book_dirs)
    book_paths = [source] if source.is_file() else list(source.rglob(pattern))

    statistic_info = {
        'book_name': [],
        'chapter_title': [],
        'paper_title': [],
        'original_title': [],
        'keyword': [],
        'save_path': [],
    }

    for book_path in book_paths:
        paper_dir = output_root.joinpath(book_path.stem)
        paper_dir.mkdir(parents=True, exist_ok=True)

        doc = fitz.Document(book_path.as_posix())
        try:
            toc = doc.get_toc(simple=False)
            contents = [[int(t[0]), t[1].strip(), int(t[2])] for t in toc]
            remain_content = sorted(
                (
                    content for content in contents
                    if not any(key in content[1] for key in EXCLUDE_KEYWORD)
                ),
                key=lambda content: content[2],
            )

            chapter_info = _group_papers_by_chapter(remain_content)
            for chapter, paper_spans in chapter_info.items():
                chapter_title = remain_content[chapter][1]
                chapter_dir = paper_dir.joinpath(chapter_title)
                chapter_dir.mkdir(parents=True, exist_ok=True)

                for first_idx, last_idx in paper_spans:
                    paper_name = remain_content[first_idx][1]
                    # TOC page numbers are 1-based, document indices 0-based.
                    start_page = remain_content[first_idx][2] - 1
                    # NOTE(review): this is the page of the paper's last TOC
                    # entry; pages after that bookmark and before the next
                    # paper are not copied -- confirm this is intended.
                    end_page = remain_content[last_idx][2] - 1
                    extracted_keyword = extract_keyword(doc[start_page])

                    clean_paper_name = _safe_file_stem(
                        paper_name, f'page_{start_page + 1}'
                    )
                    save_path = chapter_dir.joinpath(clean_paper_name).with_suffix('.pdf')

                    paper_inst = fitz.open()
                    try:
                        paper_inst.insert_pdf(doc, from_page=start_page, to_page=end_page)
                        paper_inst.save(save_path)
                    finally:
                        paper_inst.close()

                    statistic_info['original_title'].append(paper_name)
                    statistic_info['paper_title'].append(clean_paper_name)
                    statistic_info['chapter_title'].append(chapter_title)
                    statistic_info['book_name'].append(book_path.stem)
                    statistic_info['keyword'].append(extracted_keyword)
                    statistic_info['save_path'].append(save_path.as_posix())
        finally:
            # Release the source book even when a paper fails to export.
            doc.close()

    statistic_info_df = pd.DataFrame.from_dict(statistic_info)
    statistic_info_df.to_csv(output_root.joinpath('report.csv'), index=False, sep='\t')
if __name__ == '__main__':
    # Command-line entry point: split every book found under --input_dir.
    parser = argparse.ArgumentParser(
        description='Split proceedings PDFs into one PDF per paper.'
    )
    # Both paths are mandatory; without them split_pdf_book cannot run, so let
    # argparse report the problem instead of failing later with a traceback.
    parser.add_argument(
        '--input_dir',
        required=True,
        help='PDF file, or directory searched recursively for PDF files',
    )
    parser.add_argument(
        '--output_dir',
        required=True,
        help='directory that receives the split papers and report.csv',
    )
    args = parser.parse_args()
    split_pdf_book(book_dirs=args.input_dir, output_dir=args.output_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment