cftang0827/batch-transform-pdf.py

## batch-transform-pdf.py
import os
from glob import glob
import subprocess as sb

def parse_html(filename):
    """Turn the OCR sidecar text of a PDF into a minimal HTML page.

    Reads ``txt/<stem>.txt`` and writes ``html/<stem>_text.html`` (both
    relative to the current working directory), where ``<stem>`` is the part
    of *filename* before its first ``"."``.  The text is HTML-escaped, every
    newline becomes a ``<br>`` tag, and the result is wrapped in
    ``<html><div>...</div></html>``.

    Args:
        filename: Name of the PDF the sidecar belongs to, e.g. ``"scan.pdf"``.
            The ``txt`` and ``html`` directories must already exist.

    Raises:
        OSError: If the sidecar text file cannot be read or the HTML file
            cannot be written.
    """
    # Local import so the function does not rely on a file-level import.
    from html import escape

    # Same stem rule the caller uses when naming the sidecar file.
    stem = filename.split(".")[0]
    # Progress output: the sidecar file about to be converted.
    print(stem + ".txt")

    # The sidecar is expected to be UTF-8 (assumption based on ocrmypdf's
    # output); pinning the encoding avoids locale-dependent decode errors.
    with open(os.path.join(".", "txt", stem + ".txt"), encoding="utf-8") as f:
        text = f.read()

    # Escape before inserting the <br> tags so that only our own markup
    # survives; raw "<" or "&" in the OCR text would otherwise break the page.
    body = escape(text, quote=False).replace("\n", "<br>")

    with open(os.path.join(".", "html", "{}_text.html".format(stem)), "w", encoding="utf-8") as f:
        f.write("<html>\n<div>" + body + "</div>\n</html>")

# Script body: OCR every PDF in the current directory with ocrmypdf, keep the
# recognised text as a sidecar file under ./txt and render it to ./html.
print("Generate txt folder")
if not os.path.isdir("txt"):
    os.mkdir("txt")

print("Generate html folder")
if not os.path.isdir("html"):
    os.mkdir("html")

print("Get all pdf files in this folder!")
all_pdfs = glob("*.pdf")
print("Overall {} pdf files.".format(len(all_pdfs)))

print("################################")
for file in all_pdfs:
    print("Remove space in file name")
    old_file = file
    # str.replace returns a new string; the result has to be kept, otherwise
    # the rename below is a no-op and the spaces are never removed.
    file = old_file.replace(" ", "")
    if file != old_file:
        if os.path.exists(file):
            # Never overwrite another file that already owns the space-free name.
            print("Cannot rename {}: {} already exists, keeping original name".format(old_file, file))
            file = old_file
        else:
            os.rename(old_file, file)
    print("Analyzing file {}".format(file))
    sidecar = "./txt/" + file.split(".")[0] + ".txt"
    status = sb.call(["ocrmypdf", file, "output.pdf", "--sidecar", sidecar, "--force-ocr"])
    if status != 0:
        print("ocrmypdf exited with status {} for {}".format(status, file))
    if not os.path.isfile(sidecar):
        # Without a sidecar there is nothing to convert; move on to the next
        # PDF instead of crashing inside parse_html.
        print("No text produced for {}, skipping html".format(file))
        continue
    print("Generate html file")
    parse_html(file)
	import os
	from glob import glob
	import subprocess as sb

	def parse_html(filename):
	    """Turn the OCR sidecar text of a PDF into a minimal HTML page.

	    Reads ``txt/<stem>.txt`` and writes ``html/<stem>_text.html`` (both
	    relative to the current working directory), where ``<stem>`` is the part
	    of *filename* before its first ``"."``.  The text is HTML-escaped, every
	    newline becomes a ``<br>`` tag, and the result is wrapped in
	    ``<html><div>...</div></html>``.

	    Args:
	        filename: Name of the PDF the sidecar belongs to, e.g. ``"scan.pdf"``.
	            The ``txt`` and ``html`` directories must already exist.

	    Raises:
	        OSError: If the sidecar text file cannot be read or the HTML file
	            cannot be written.
	    """
	    # Local import so the function does not rely on a file-level import.
	    from html import escape

	    # Same stem rule the caller uses when naming the sidecar file.
	    stem = filename.split(".")[0]
	    # Progress output: the sidecar file about to be converted.
	    print(stem + ".txt")

	    # The sidecar is expected to be UTF-8 (assumption based on ocrmypdf's
	    # output); pinning the encoding avoids locale-dependent decode errors.
	    with open(os.path.join(".", "txt", stem + ".txt"), encoding="utf-8") as f:
	        text = f.read()

	    # Escape before inserting the <br> tags so that only our own markup
	    # survives; raw "<" or "&" in the OCR text would otherwise break the page.
	    body = escape(text, quote=False).replace("\n", "<br>")

	    with open(os.path.join(".", "html", "{}_text.html".format(stem)), "w", encoding="utf-8") as f:
	        f.write("<html>\n<div>" + body + "</div>\n</html>")

	# Script body: OCR every PDF in the current directory with ocrmypdf, keep the
	# recognised text as a sidecar file under ./txt and render it to ./html.
	print("Generate txt folder")
	if not os.path.isdir("txt"):
	    os.mkdir("txt")

	print("Generate html folder")
	if not os.path.isdir("html"):
	    os.mkdir("html")

	print("Get all pdf files in this folder!")
	all_pdfs = glob("*.pdf")
	print("Overall {} pdf files.".format(len(all_pdfs)))

	print("################################")
	for file in all_pdfs:
	    print("Remove space in file name")
	    old_file = file
	    # str.replace returns a new string; the result has to be kept, otherwise
	    # the rename below is a no-op and the spaces are never removed.
	    file = old_file.replace(" ", "")
	    if file != old_file:
	        if os.path.exists(file):
	            # Never overwrite another file that already owns the space-free name.
	            print("Cannot rename {}: {} already exists, keeping original name".format(old_file, file))
	            file = old_file
	        else:
	            os.rename(old_file, file)
	    print("Analyzing file {}".format(file))
	    sidecar = "./txt/" + file.split(".")[0] + ".txt"
	    status = sb.call(["ocrmypdf", file, "output.pdf", "--sidecar", sidecar, "--force-ocr"])
	    if status != 0:
	        print("ocrmypdf exited with status {} for {}".format(status, file))
	    if not os.path.isfile(sidecar):
	        # Without a sidecar there is nothing to convert; move on to the next
	        # PDF instead of crashing inside parse_html.
	        print("No text produced for {}, skipping html".format(file))
	        continue
	    print("Generate html file")
	    parse_html(file)