Skip to content

Instantly share code, notes, and snippets.

@cjudd
Last active December 24, 2022 16:05
Show Gist options
  • Save cjudd/ed9deeb6af293e1bd2d8acebb6380cec to your computer and use it in GitHub Desktop.
Save cjudd/ed9deeb6af293e1bd2d8acebb6380cec to your computer and use it in GitHub Desktop.
I have lots of PDF books and wanted a simple index that I can search, so this script generates a Markdown file containing each book's filename, title, and other metadata, plus its outline when one is available.
from pathlib import Path
from mdutils.mdutils import MdUtils
from mdutils import Html
from PyPDF2 import PdfReader
# Markdown document that the rest of the script appends to; it is created
# with the file name 'books' and the title 'My PDF Books'.
mdFile = MdUtils(
    title='My PDF Books',
    file_name='books',
)
def outline_list(outline):
    """Convert a nested outline into a nested list of heading titles.

    Args:
        outline: An iterable whose items are either heading objects exposing
            a ``title`` attribute or lists (including list subclasses) of
            further items. The script passes ``reader.outline`` from a
            ``PdfReader`` here.

    Returns:
        A new list with the same nesting as ``outline`` in which every
        heading object is replaced by its ``title``.

    Raises:
        AttributeError: If a non-list item has no ``title`` attribute.
    """
    # isinstance (rather than an exact type comparison) lets list subclasses
    # be recursed into instead of being mistaken for a heading object.
    return [
        outline_list(heading) if isinstance(heading, list) else heading.title
        for heading in outline
    ]
# Recursively find every '*.pdf' below the current directory and append one
# level-1 section per file to the Markdown document.
for path in Path('.').rglob('*.pdf'):
    # Displayed path is built from the parent directory and the file name.
    filename = str(path.parent) + "/" + path.name
    reader = PdfReader(filename)
    mdFile.new_header(level=1, title=path.name)

    # Encrypted files are only listed: their metadata and outline are not read.
    if reader.is_encrypted:
        mdFile.new_paragraph('''Filename: {filename}
**Encrypted**'''.format(filename=filename))
        continue

    meta = reader.metadata
    if meta is not None:
        mdFile.new_header(level=2, title='Meta Data')
        mdFile.new_paragraph('''Filename: {filename}
Title: {title}
Author(s): {authors}
Creator: {creator}
Subject: {subject}
Page Count: {pages}
'''.format(filename=filename,
           title=meta.title,
           authors=meta.author,
           creator=meta.creator,
           subject=meta.subject,
           pages=len(reader.pages)))
    else:
        mdFile.new_paragraph('''Filename: {filename}
**No metadata**'''.format(filename=filename))

    # The 'Outline' header is written first, so when reading or rendering the
    # outline fails the section still exists and carries the notice below.
    try:
        mdFile.new_header(level=2, title='Outline')
        outline = outline_list(reader.outline)
        mdFile.new_list(outline)
    except Exception:
        # Best effort: a broken outline must not abort the whole index.
        # Catching Exception (not a bare except) still lets KeyboardInterrupt
        # and SystemExit stop the run.
        mdFile.new_paragraph('Invalid outline')
# After all PDFs have been processed: add a table of contents titled
# 'Contents' (depth 1), create the Markdown file, and report completion.
mdFile.new_table_of_contents(
    depth=1,
    table_title='Contents',
)
mdFile.create_md_file()
print('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment