Created
December 3, 2023 19:16
-
-
Save techtide/f576ae963d8d6c3308eddbd3fa5e7e0a to your computer and use it in GitHub Desktop.
Bulk converter from PDF to Markdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import subprocess
import tempfile

from PyPDF2 import PdfReader
def convert_pdf_to_markdown(pdf_path, output_dir):
    """Convert a single PDF into a Markdown file using pandoc.

    The text of every page is extracted with ``PdfReader``, written to an
    intermediate UTF-8 text file, and passed to the ``pandoc`` executable,
    which writes ``<PDF base name>.md`` into ``output_dir``.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the generated ``.md`` file.
            It is not created here.

    Raises:
        OSError: If ``pdf_path`` cannot be opened, or if the ``pandoc``
            executable cannot be started.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        # Extract while the file is still open. ``or ""`` guards against a
        # page for which no text is returned.
        text_content = "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_md_file = os.path.join(output_dir, pdf_stem + '.md')

    # Keep the intermediate file in a private scratch directory: it cannot
    # overwrite an existing "temp_text.txt" in output_dir, and it is removed
    # even when pandoc cannot be launched.
    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_text_file = os.path.join(scratch_dir, 'temp_text.txt')
        with open(temp_text_file, 'w', encoding='utf-8') as temp_file:
            temp_file.write(text_content)
        # NOTE: pandoc's exit status is not checked, so a failed conversion
        # does not raise here.
        subprocess.run(["pandoc", temp_text_file, "-o", output_md_file])
if __name__ == "__main__":
    # Convert every PDF found directly inside ``PDFs`` into a Markdown file
    # of the same base name inside ``MDs``.
    input_directory = "PDFs"
    output_directory = "MDs"

    # exist_ok=True replaces the separate existence check, which could race
    # with another process creating the directory.
    os.makedirs(output_directory, exist_ok=True)

    for filename in os.listdir(input_directory):
        # Compare the extension case-insensitively so files such as
        # "REPORT.PDF" are not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        input_path = os.path.join(input_directory, filename)
        convert_pdf_to_markdown(input_path, output_directory)
        print(f"Conversion complete for {filename}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment