-
-
Save drelatgithub/6f2888c2109f29206786105f2d00db5e to your computer and use it in GitHub Desktop.
###############################################################################
#
# Docin document downloader
#
# Valid as of 2022-12-13
#
###############################################################################
import argparse
import os
import urllib.error
import urllib.request
from types import SimpleNamespace
# Script-wide settings; both fields are overwritten from the command line
# in the ``__main__`` section before any download starts.
conf = SimpleNamespace(
    docin_pid=0,    # document id (the number after "p-" in the docin url)
    output_dir="",  # directory the page images are written to
)
def download_image(pid, output_dir=None):
    """Download the page images of a Docin document, one file per page.

    Pages are requested one at a time, starting at page 1, and saved as
    ``<page>.png``. The loop ends at the first page for which the server
    answers with an HTTP error, which is treated as the end of the document.
    Any other error (for example a connection failure) propagates.

    Args:
        pid: Document id placed in the ``file`` query parameter (the number
            after "p-" in the docin url).
        output_dir: Directory to save into. When ``None`` (the default),
            ``conf.output_dir`` is used, as in the original script.
    """
    if output_dir is None:
        output_dir = conf.output_dir

    page = 0
    while True:
        page += 1
        url = "http://211.147.220.164/index.jsp?file={}&pageno={}".format(pid, page)
        target = os.path.join(output_dir, "{}.png".format(page))
        try:
            urllib.request.urlretrieve(url, target)
        except urllib.error.HTTPError:
            # An HTTP error is taken to mean "no such page": stop here.
            break
        else:
            print("Page", page, "saved.")
if __name__ == "__main__":
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("docin_pid", type=str, help="The number after \"p-\" in docin url")
    parser.add_argument("output_dir", type=str, help="The output directory")
    args = parser.parse_args()
    conf.docin_pid = args.docin_pid
    conf.output_dir = args.output_dir

    # urlretrieve cannot write into a directory that does not exist, so
    # create it up front. An empty string means "current directory" and
    # must be skipped because os.makedirs("") raises.
    if conf.output_dir:
        os.makedirs(conf.output_dir, exist_ok=True)

    # Do the work
    download_image(conf.docin_pid)
import os
import urllib.request
from PIL import Image
def download_images(pid, output_dir):
    """Download the page images of a Docin document and return their paths.

    Pages are requested one at a time, starting at page 1, and saved as
    ``<page>.png`` inside ``output_dir``. The loop ends at the first page for
    which the server answers with an HTTP error. Any other error propagates.

    Args:
        pid: Document id placed in the ``file`` query parameter.
        output_dir: Directory the images are written to.

    Returns:
        The list of saved file paths, in page order. Empty when the very
        first request already fails with an HTTP error.
    """
    images = []
    page = 1
    while True:
        file_path = os.path.join(output_dir, f"{page}.png")
        url = f"http://211.147.220.164/index.jsp?file={pid}&pageno={page}"
        # Keep the try body limited to the one call that can raise HTTPError.
        try:
            urllib.request.urlretrieve(url, file_path)
        except urllib.error.HTTPError:
            print("Download completed or no more pages to fetch.")
            break
        print(f"Page {page} downloaded.")
        images.append(file_path)
        page += 1
    return images
def images_to_pdf(images, output_dir):
    """Combine image files into ``document.pdf`` inside ``output_dir``.

    Each image becomes one PDF page, in the order given. Nothing is written
    when ``images`` is empty.

    Args:
        images: Paths of the image files to combine.
        output_dir: Directory in which ``document.pdf`` is created.
    """
    if not images:
        return

    pdf_path = os.path.join(output_dir, "document.pdf")

    pages = []
    for image in images:
        # convert() returns a new image, so the source file can be closed
        # straight away instead of staying open until garbage collection.
        with Image.open(image) as source:
            pages.append(source.convert('RGB'))

    pages[0].save(pdf_path, save_all=True, append_images=pages[1:])
    print(f"PDF saved at {pdf_path}")
# Script entry point: download every page, then bundle the pages into a PDF.
if __name__ == "__main__":
    pid = 2064621039  # Example PID, replace with the actual PID you want to use
    output_dir = "/content/drive/MyDrive/Data"  # Change to your desired output directory

    # exist_ok=True replaces the separate exists() check and its
    # check-then-create race.
    os.makedirs(output_dir, exist_ok=True)

    images = download_images(pid, output_dir)
    if images:
        images_to_pdf(images, output_dir)
Here is the new code.
Works in 2022, thanks!
(Add &width=1836&height=2376 to the request URL for higher-resolution images.)