@TestItOnlyOnce
Forked from hanjianwei/pdftitle.py
Last active March 23, 2022 03:30
Rename PDF files to the titles found in their content (Python 3; wildcard support; by default renames all PDFs in the current directory)
#!/usr/bin/env python
"""
Extracts the title from PDF files and optionally renames them (Python 3).

Depends on: PyPDF2, pdfminer3k, unidecode.

Usage:
    pdftitle.py [-n|--dry-run] [-r|--rename] [-d DIR] [filenames...]

Example:
    pdftitle.py -d tmp --rename *.pdf
"""
from io import StringIO
import getopt, os, re, string, sys, glob, unidecode
from PyPDF2 import PdfFileReader
from PyPDF2.utils import PdfReadError
from pdfminer.pdfparser import PDFParser, PDFDocument, PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTChar, LTText, LTFigure, LTTextBox, LTTextLine
__all__ = ['pdf_title']
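# Illustrative invocations (added for clarity; assumes this file is saved as
# pdftitle.py -- the flags match the option parsing at the bottom of the script):
#
#   python pdftitle.py paper.pdf                 # print the extracted title, no renaming
#   python pdftitle.py --rename -n *.pdf         # dry run: show the renames without doing them
#   python pdftitle.py -d tmp --rename *.pdf     # rename and move the files into ./tmp
#
# With no arguments and ONE_CLICK_MODE enabled, every PDF in the current
# directory is renamed in place.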
def make_parsing_state(*sequential, **named):
    enums = dict(zip(sequential, range(len(sequential))), **named)
    return type('ParsingState', (), enums)
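# Illustrative note (not part of the original script): make_parsing_state('A', 'B', 'C')
# builds a lightweight enum-like class with A == 0, B == 1, C == 2; this is how
# CHAR_PARSING_STATE below gets its INIT_X / INIT_D / INSIDE_WORD values.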
def log(text):
    if IS_LOG_ON:
        print('-------- ' + text)
CHAR_PARSING_STATE = make_parsing_state('INIT_X', 'INIT_D', 'INSIDE_WORD')
IS_LOG_ON = False
ONE_CLICK_MODE = True
MIN_CHARS = 6
MAX_WORDS = 20
MIN_LONGEST_WORD = 4
def max_word_length(text):
    return max(len(w) for w in text.split(' '))
def sanitize(filename):
    """Turn a string into a valid file name.
    """
    # If the title was picked up from the text, it may be too long,
    # so keep only the first MAX_WORDS words.
    words = filename.split(' ')
    filename = ' '.join(words[0:MAX_WORDS])

    # Transliterate accented letters to ASCII so they survive the
    # valid-character filter below instead of being dropped.
    try:
        filename = unidecode.unidecode(filename.encode('utf-8').decode('utf-8'))
    except UnicodeDecodeError:
        print("*** Skipping invalid title decoding ***")

    # Preserve the subtitle separator
    filename = re.sub(r':', ' -', filename)

    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    return "".join([c for c in filename if c in valid_chars])
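# Illustrative example (not part of the original script):
#   sanitize('Attention: Is All You Need?') -> 'Attention - Is All You Need'
# The colon becomes ' -' and characters outside the whitelist (here, '?') are dropped.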
def meta_title(filename):
    """Title from PDF metadata.
    """
    fp = open(filename, 'rb')
    docinfo = PdfFileReader(fp).getDocumentInfo()
    fp.close()
    if docinfo is None:
        return ""
    return docinfo.title if docinfo.title else ""
def junk_line(line):
    """Judge whether a line is unsuitable as a title.
    """
    too_small = len(line.strip()) < MIN_CHARS
    has_no_words = bool(re.search(r'^[0-9 \t-]+$|^(\(cid:[0-9 \t-]*\))+|^(abstract|unknown|title|untitled):?$', line.strip().lower()))
    is_copyright_info = bool(re.search(r'technical\s+report|proceedings|preprint|to\s+appear|submission|(integrated|international).*conference|transactions\s+on|symposium\s+on|downloaded\s+from\s+http', line.lower()))
    return too_small or has_no_words or is_copyright_info
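# Illustrative examples (not part of the original script):
#   junk_line('Proceedings of the 2019 Conference')  -> True  (venue/boilerplate keywords)
#   junk_line('(cid:12)(cid:34)')                     -> True  (unmapped glyphs)
#   junk_line('A Study of PDF Title Extraction')      -> False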
def empty_str(s):
    return len(s.strip()) == 0


def update_largest_text(line, size, largest_text):
    log('update size: ' + str(size))
    log('largest_text size: ' + str(largest_text['size']))
    if not empty_str(line):
        if size > largest_text['size']:
            largest_text = {
                'contents': line,
                'size': size
            }
        # Title spans multiple lines
        elif size == largest_text['size']:
            largest_text['contents'] = largest_text['contents'] + line
    return largest_text
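# Illustrative note (not part of the original script): starting from
# {'contents': '', 'size': 0}, feeding lines with font sizes 9, 17, 17 leaves
# {'contents': '<line 2><line 3>', 'size': 17} -- lines sharing the largest size
# are concatenated, on the assumption that the title spans several lines.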
def extract_largest_text(obj, largest_text):
    for child in obj:
        if isinstance(child, LTTextLine):
            log('lt_obj child line: ' + str(child))
            for child2 in child:
                if isinstance(child2, LTChar):
                    largest_text = update_largest_text(child.get_text(), child2.size, largest_text)
                    break
        elif isinstance(child, LTChar):
            largest_text = update_largest_text(obj.get_text(), child.size, largest_text)
            break
    return largest_text
def extract_figure_text(lt_obj, largest_text):
    """
    Extract text contained in an `LTFigure`.
    Since the text is encoded in `LTChar` elements, we detect separate lines
    by keeping track of changes in font size.
    """
    text = ""
    line = ""
    size = 0
    char_distance = 0
    char_previous_x1 = 0
    state = CHAR_PARSING_STATE.INIT_X
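    # Descriptive note (added for clarity): in the loop below, the running
    # character spacing `char_distance` is learned from gaps up to 2.5x the
    # previous value; a gap larger than 8.5x that spacing is treated as a
    # missing space, and a decrease in x-position is treated as a line wrap.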
    for child in lt_obj:
        log('child: ' + str(child))

        # Ignore other elements
        if not isinstance(child, LTChar):
            continue

        char_size = child.size
        char_text = child.get_text()
        decoded_char_text = unidecode.unidecode(char_text.encode('utf-8').decode('utf-8'))
        log('char: ' + str(char_size) + ' ' + str(decoded_char_text))

        # A new line was detected
        if char_size != size:
            log('new line')
            largest_text = update_largest_text(line, size, largest_text)
            text += line + '\n'
            line = char_text
            size = char_size
            char_previous_x1 = child.x1
            state = CHAR_PARSING_STATE.INIT_D
        # The same line
        else:
            # Spaces may not be present as `LTChar` elements,
            # so we manually add them.
            # NOTE: A word starting with lowercase can't be
            # distinguished from the current word.
            char_current_distance = abs(child.x0 - char_previous_x1)
            log('char_current_distance: ' + str(char_current_distance))
            log('char_distance: ' + str(char_distance))
            log('state: ' + str(state))

            # Initialization
            if state == CHAR_PARSING_STATE.INIT_X:
                char_previous_x1 = child.x1
                state = CHAR_PARSING_STATE.INIT_D
            elif state == CHAR_PARSING_STATE.INIT_D:
                # Update distance only if no space is detected
                if (char_distance > 0) and (char_current_distance < char_distance * 2.5):
                    char_distance = char_current_distance
                if char_distance < 0.1:
                    char_distance = 0.1
                state = CHAR_PARSING_STATE.INSIDE_WORD

            # If the x-position decreased, then it's a new line
            if (state == CHAR_PARSING_STATE.INSIDE_WORD) and (child.x1 < char_previous_x1):
                log('x-position decreased')
                line += ' '
                char_previous_x1 = child.x1
                state = CHAR_PARSING_STATE.INIT_D
            # Large enough distance: it's a space
            elif (state == CHAR_PARSING_STATE.INSIDE_WORD) and (char_current_distance > char_distance * 8.5):
                log('space detected')
                log('char_current_distance: ' + str(char_current_distance))
                log('char_distance: ' + str(char_distance))
                line += ' '
                char_previous_x1 = child.x1
            # When a larger distance is detected between chars, use it to
            # improve our heuristic
            elif (state == CHAR_PARSING_STATE.INSIDE_WORD) and (char_current_distance > char_distance) and (char_current_distance < char_distance * 2.5):
                char_distance = char_current_distance
                char_previous_x1 = child.x1
            # Chars are sequential
            else:
                char_previous_x1 = child.x1
            line += child.get_text()
    return (largest_text, text)
def pdf_text(filename):
    fp = open(filename, 'rb')
    doc = PDFDocument(caching=False)
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()

    rsrcmgr = PDFResourceManager()
    laparams = LAParams(all_texts=True)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    text = ""
    largest_text = {
        'contents': "",
        'size': 0
    }
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            log('lt_obj: ' + str(lt_obj))
            if isinstance(lt_obj, (LTFigure, LTTextBox, LTTextLine)):
                if isinstance(lt_obj, LTFigure):
                    (largest_text, figure_text) = extract_figure_text(lt_obj, largest_text)
                    text += figure_text
                else:
                    largest_text = extract_largest_text(lt_obj, largest_text)
                    text += lt_obj.get_text() + '\n'
        # Only parse the first page
        break

    fp.close()
    return (largest_text, text)
def title_start(lines):
    for i, line in enumerate(lines):
        if not empty_str(line) and not junk_line(line):
            return i
    return 0


def title_end(lines, start, max_lines=2):
    for i, line in enumerate(lines[start+1:start+max_lines+1], start+1):
        if empty_str(line):
            return i
    return start + 1
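# Illustrative example (not part of the original script): for
#   lines = ['3', 'A Study of', 'PDF Titles', '', 'First Author ...']
# title_start(lines) returns 1 (the short line '3' is judged junk) and
# title_end(lines, 1) returns 3, so lines[1:3] joins to 'A Study of PDF Titles'.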
def text_title(filename):
    """Extract the title from the PDF's text.
    """
    (largest_text, lines_joined) = pdf_text(filename)
    lines = lines_joined.strip().split('\n')

    if empty_str(largest_text['contents']):
        i = title_start(lines)
        j = title_end(lines, i)
        text = ' '.join(line.strip() for line in lines[i:j])
    else:
        text = largest_text['contents'].strip()

    # Strip dots, which conflict with os.path's splitext()
    text = re.sub(r'\.', '', text)
    return text
def valid_title(title):
    return not empty_str(title) and max_word_length(title) > MIN_LONGEST_WORD and not junk_line(title) and empty_str(os.path.splitext(title)[1])
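# Illustrative examples (not part of the original script):
#   valid_title('Deep Residual Learning')  -> True
#   valid_title('abstract')                -> False  (matched by junk_line)
#   valid_title('report.pdf')              -> False  (carries a file extension)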
def pdf_title(filename):
    """Extract the title using one of several strategies.
    """
    title = ""
    try:
        title = meta_title(filename)
        if valid_title(title):
            return title
    except Exception:
        print("*** Skipping invalid metadata! ***")

    try:
        title = text_title(filename)
        if valid_title(title):
            return title
    except Exception:
        print("*** Skipping invalid parsing! ***")

    if valid_title(title):
        return title
    # Fall back to the file name without its extension
    return os.path.basename(os.path.splitext(filename)[0])
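# Illustrative note (not part of the original script): the strategies are tried
# in order -- PDF metadata first, then the text-layout heuristics -- and
# pdf_title('scanned_file.pdf') falls back to 'scanned_file' when neither
# yields a valid title (for example, an image-only scan with no text layer).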
def process_file(directory, filename, rename, dry_run):
    title = pdf_title(filename)
    title = sanitize(' '.join(title.split()))

    if rename:
        new_name = os.path.join(directory, title + ".pdf")
        print("%s => %s" % (filename, new_name))
        if not dry_run:
            if os.path.exists(new_name):
                print("*** Target %s already exists! ***" % new_name)
            else:
                # Rename, then restore the original access/modification times
                stat = os.stat(filename)
                os.rename(filename, new_name)
                os.utime(new_name, (stat.st_atime, stat.st_mtime))
    else:
        print("%s => %s" % (filename, title))
    return 0
def path_leaf(path):
    head, tail = os.path.split(path)
    return tail or os.path.basename(head)
if __name__ == "__main__":
    opts, args = getopt.getopt(sys.argv[1:], 'nrd:', ['dry-run', 'rename'])

    dry_run = False
    rename = False
    target_dir = "."

    for opt, arg in opts:
        if opt in ['-n', '--dry-run']:
            dry_run = True
        elif opt in ['-r', '--rename']:
            rename = True
        elif opt in ['-d']:
            target_dir = arg

    if len(args) == 0:
        print("Usage: %s [-d output] [--dry-run] [--rename] [filenames]\n" % path_leaf(sys.argv[0]))
        if ONE_CLICK_MODE:
            # No file names given: process every PDF in the current directory
            args = ['*.pdf']
            rename = True
        else:
            sys.exit(1)

    for filename in args:
        if "*" in filename:
            # Expand wildcard patterns that the shell did not expand
            for filenameexpanded in glob.glob(filename):
                process_file(target_dir, filenameexpanded, rename, dry_run)
        else:
            process_file(target_dir, filename, rename, dry_run)
@AtomicNess123
How does this work, as a function? What is "temp" in pdftitle -d tmp --rename *.pdf{}? Thanks!
