Skip to content

Instantly share code, notes, and snippets.

@techtide
Created December 3, 2023 19:16
Show Gist options
  • Save techtide/f576ae963d8d6c3308eddbd3fa5e7e0a to your computer and use it in GitHub Desktop.
Save techtide/f576ae963d8d6c3308eddbd3fa5e7e0a to your computer and use it in GitHub Desktop.
Bulk converter from PDF to Markdown
import os
import subprocess
import tempfile

from PyPDF2 import PdfReader
def convert_pdf_to_markdown(pdf_path, output_dir):
    """Extract the text of one PDF and convert it to Markdown with pandoc.

    The text of every page is extracted with PyPDF2, written to a scratch
    text file, and handed to the ``pandoc`` executable, which writes
    ``<output_dir>/<pdf base name>.md``.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the ``.md`` file; created if it
            does not exist yet.

    Returns:
        The path of the Markdown file that pandoc was asked to write.

    Raises:
        OSError: If the PDF cannot be opened or pandoc cannot be launched.
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        # ``or ""`` guards against pages for which no text is returned.
        # Pages are separated by a blank line so the last word of one page
        # is not glued to the first word of the next.
        text_content = "\n\n".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

    os.makedirs(output_dir, exist_ok=True)
    output_md_file = os.path.join(
        output_dir,
        os.path.splitext(os.path.basename(pdf_path))[0] + '.md',
    )

    # A private scratch directory avoids name clashes between concurrent
    # runs and is removed even when pandoc fails or is missing.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_text_file = os.path.join(scratch_dir, 'temp_text.txt')
        with open(temp_text_file, 'w', encoding='utf-8') as temp_file:
            temp_file.write(text_content)
        # check=True surfaces pandoc failures instead of silently
        # continuing as if the Markdown file had been produced.
        subprocess.run(
            ["pandoc", temp_text_file, "-o", output_md_file],
            check=True,
        )

    return output_md_file
if __name__ == "__main__":
    # Batch driver: convert every PDF found directly inside ``PDFs`` into a
    # Markdown file with the same base name inside ``MDs``.
    input_directory = "PDFs"
    output_directory = "MDs"

    if not os.path.isdir(input_directory):
        raise SystemExit(f"Input directory not found: {input_directory}")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_directory, exist_ok=True)

    failed = []
    # Sorted so that runs are reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(input_directory)):
        # Compare the extension case-insensitively so "REPORT.PDF" is
        # converted as well.
        if not filename.lower().endswith(".pdf"):
            continue
        input_path = os.path.join(input_directory, filename)
        try:
            convert_pdf_to_markdown(input_path, output_directory)
        except Exception as exc:
            # Top-level boundary: report the failure and keep going so one
            # unreadable PDF does not abort the rest of the batch.
            failed.append(filename)
            print(f"Conversion failed for {filename}: {exc}")
        else:
            print(f"Conversion complete for {filename}")

    # Signal partial failure to the calling shell.
    if failed:
        raise SystemExit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment