Skip to content

Instantly share code, notes, and snippets.

@cftang0827
Created July 19, 2020 10:39
Show Gist options
  • Save cftang0827/36caa8c594e02d06c8634c05fda97c7c to your computer and use it in GitHub Desktop.
Save cftang0827/36caa8c594e02d06c8634c05fda97c7c to your computer and use it in GitHub Desktop.
import os
from glob import glob
import subprocess as sb
def parse_html(filename):
    """Turn the OCR sidecar text of *filename* into a minimal HTML page.

    Reads ``./txt/<stem>.txt`` and writes ``./html/<stem>_text.html``, where
    ``<stem>`` is the part of *filename* before its first ``"."``. Every
    newline in the text becomes a ``<br>`` tag and the whole text is wrapped
    in ``<html><div>...</div></html>``.

    Args:
        filename: Name of the source PDF (for example ``"scan.pdf"``). Only
            its stem is used; the PDF itself is not opened.

    Raises:
        FileNotFoundError: If the sidecar text file is missing or the
            ``html`` folder does not exist.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from html import escape

    # The stem rule (text before the first ".") must stay in sync with the
    # sidecar path handed to ocrmypdf by the caller.
    stem = filename.split(".")[0]
    print(stem + ".txt")

    txt_path = os.path.join(".", "txt", stem + ".txt")
    html_path = os.path.join(".", "html", "{}_text.html".format(stem))

    # The OCR sidecar is read as UTF-8; being explicit avoids decode errors
    # on platforms whose default encoding is something else.
    with open(txt_path, encoding="utf-8") as f:
        texts = f.readlines()

    with open(html_path, "w", encoding="utf-8") as f:
        f.write("<html>\n<div>")
        for t in texts:
            # Escape &, < and > so recognised text cannot be parsed as markup.
            f.write(escape(t, quote=False).replace("\n", "<br>"))
        f.write("</div>\n</html>")
# Driver script: OCR every PDF in the current folder with ocrmypdf, keep the
# recognised text in ./txt and render it as a simple HTML page in ./html.

print("Generate txt folder")
os.makedirs("txt", exist_ok=True)
print("Generate html folder")
os.makedirs("html", exist_ok=True)

print("Get all pdf files in this folder!")
all_pdfs = glob("*.pdf")
print("Overall {} pdf files.".format(len(all_pdfs)))
print("################################")

for old_file in all_pdfs:
    print("Remove space in file name")
    # str.replace returns a new string; the result has to be kept for the
    # rename below to have any effect.
    pdf_file = old_file.replace(" ", "")
    if pdf_file != old_file:
        if os.path.exists(pdf_file):
            # Renaming would overwrite another file; keep the original name,
            # which is still safe because the command is passed as a list.
            print("Cannot rename {}: {} already exists".format(old_file, pdf_file))
            pdf_file = old_file
        else:
            os.rename(old_file, pdf_file)

    print("Analyzing file {}".format(pdf_file))
    # The sidecar name uses the text before the first "." of the file name,
    # which is the same rule parse_html uses to locate the text file.
    sidecar = "./txt/" + pdf_file.split(".")[0] + ".txt"
    status = sb.call(
        ["ocrmypdf", pdf_file, "output.pdf", "--sidecar", sidecar, "--force-ocr"]
    )
    if status != 0:
        # Without a successful OCR run the sidecar may be missing; skip this
        # file instead of failing inside parse_html.
        print("ocrmypdf failed for {} (exit code {})".format(pdf_file, status))
        continue

    print("Generate html file")
    parse_html(pdf_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment