A simple script that renames a PDF document to its most likely title using OpenAI, written by, you guessed it, OpenAI :)
You will need an OpenAI API key.
I like to read and collect a lot of papers, but they almost always come with names like 0909.1788.pdf, so I rename each and every one.
The script works by taking a PDF file as input, extracting the text from the first page, sending this text to an LLM for title prediction, and then renaming the original PDF file to the predicted title.
First, make sure you have installed the PyPDF2 and openai packages:
pip install PyPDF2 openai
Next, create a file named rename.py with the following content:
import os, sys
import PyPDF2
import openai
# Prefer supplying the key through the OPENAI_API_KEY environment variable so
# that it never has to be written into this file. If the variable is not set,
# replace the placeholder below with your OpenAI API key.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your openai api key")
def extract_first_page_text(file_path):
    """Return the text PyPDF2 extracts from the first page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Whatever ``extract_text()`` yields for page index 0.
    """
    # The page is read and extracted while the file handle is still open.
    with open(file_path, "rb") as stream:
        return PyPDF2.PdfReader(stream).pages[0].extract_text()
def get_title_from_llm(text):
    """Ask an OpenAI completion model for the most likely title of a document.

    Args:
        text: Text extracted from the document; it is embedded verbatim in
            the prompt.

    Returns:
        The text of the first completion choice with surrounding whitespace
        stripped.
    """
    request = dict(
        # text-davinci-002 has been retired by OpenAI; gpt-3.5-turbo-instruct
        # is the replacement model for the same completions endpoint.
        model="gpt-3.5-turbo-instruct",
        prompt=f"What is the most likely title of this PDF document based on the following text? Reply only with the title\n\n{text}",
        max_tokens=50,
        n=1,
        stop=None,
        temperature=0.5,
    )
    if hasattr(openai, "OpenAI"):
        # openai>=1.0 removed openai.Completion; the module-level client
        # exposes the same endpoint as openai.completions.
        response = openai.completions.create(**request)
    else:
        # Legacy (pre-1.0) SDK.
        response = openai.Completion.create(**request)
    return response.choices[0].text.strip()
def rename_pdf(file_path, new_title):
    """Rename a file to ``new_title``, keeping its directory and extension.

    The title is cleaned before it is used as a file name: path separators,
    control characters (including newlines) and the characters
    ``\\ / : * ? " < > |`` are replaced by spaces, runs of whitespace are
    collapsed, leading/trailing spaces and dots are removed, and the result
    is limited to 200 characters.

    Args:
        file_path: Path of the existing file.
        new_title: Desired title; may contain characters that are not valid
            in a file name.

    Returns:
        None. The outcome is reported on standard output. The file is left
        untouched when the cleaned title is empty or when a file with the
        target name already exists.
    """
    directory, file_name = os.path.split(file_path)
    file_extension = os.path.splitext(file_name)[1]

    # A raw title containing "/" would make os.path.join point into another
    # directory, and several other characters are rejected by common
    # filesystems, so neutralise them before building the new name.
    forbidden = '\\/:*?"<>|'
    cleaned = "".join(
        " " if (ch in forbidden or ord(ch) < 32) else ch for ch in new_title
    )
    safe_title = " ".join(cleaned.split()).strip(" .")[:200].rstrip(" .")
    if not safe_title:
        print("Could not derive a usable file name from the title; file left unchanged.")
        return

    new_file_name = f"{safe_title}{file_extension}"
    new_file_path = os.path.join(directory, new_file_name)
    if not os.path.exists(new_file_path):
        os.rename(file_path, new_file_path)
        print(f"File renamed to: {new_file_path}")
    else:
        print(f"File with the same name '{new_file_name}' already exists in the directory.")
def main(argv=None):
    """Command-line entry point: rename one PDF to its predicted title.

    Args:
        argv: Argument list shaped like ``sys.argv`` (program name first,
            then the PDF path). Defaults to ``sys.argv``.

    Returns:
        1 when the arguments are unusable (wrong count, not a ``.pdf`` name,
        or the file does not exist); 0 once the rename step has been run.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 2:
        # The surrounding instructions save this script as rename.py.
        print("Usage: python rename.py <pdf_file>")
        return 1

    pdf_file_path = args[1]
    # Compare case-insensitively so that names such as PAPER.PDF are accepted.
    if not pdf_file_path.lower().endswith(".pdf"):
        print("Please provide a PDF file.")
        return 1
    # Report a missing file before any text extraction or API call is made.
    if not os.path.isfile(pdf_file_path):
        print(f"File not found: {pdf_file_path}")
        return 1

    first_page_text = extract_first_page_text(pdf_file_path)
    title = get_title_from_llm(first_page_text)
    rename_pdf(pdf_file_path, title)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Finally, run the script with the command below, replacing <path_to_pdf_file> with the path to the PDF file you want to process:
python rename.py <path_to_pdf_file>