Skip to content

Instantly share code, notes, and snippets.

@drelatgithub
Last active May 11, 2024 04:01
Show Gist options
  • Save drelatgithub/6f2888c2109f29206786105f2d00db5e to your computer and use it in GitHub Desktop.
Save drelatgithub/6f2888c2109f29206786105f2d00db5e to your computer and use it in GitHub Desktop.
Docin document downloader
###############################################################################
#
# Docin document downloader
#
# Valid as of 2022-12-13
#
###############################################################################
import argparse
import os
from types import SimpleNamespace
import urllib.request
# Module-wide settings. Both fields start out as placeholders and are
# replaced with the command-line arguments when the file runs as a script.
conf = SimpleNamespace(docin_pid=0, output_dir="")
def download_image(pid, output_dir=None):
    """Download the pages of a Docin document as numbered PNG files.

    Pages are requested one at a time, starting at page 1, and each one is
    written to ``<output_dir>/<page number>.png``. The loop ends at the first
    page for which the request raises an HTTP error.

    Args:
        pid: The Docin document id (the number after "p-" in the docin url).
        output_dir: Directory the pages are saved into. When omitted, the
            module-level ``conf.output_dir`` is used, which keeps existing
            ``download_image(pid)`` calls working unchanged.

    Raises:
        urllib.error.URLError: For failures other than an HTTP error status
            (e.g. the host cannot be reached); these are not swallowed.
    """
    # Imported explicitly: the except clause needs this name, and relying on
    # "import urllib.request" to load urllib.error is an implicit side effect.
    from urllib.error import HTTPError

    if output_dir is None:
        output_dir = conf.output_dir
    if output_dir:
        # urlretrieve opens the target file directly and does not create
        # missing directories. An empty string means "current directory".
        os.makedirs(output_dir, exist_ok=True)

    page = 0
    while True:
        page += 1
        try:
            urllib.request.urlretrieve(
                "http://211.147.220.164/index.jsp?file={}&pageno={}".format(pid, page),
                os.path.join(output_dir, "{}.png".format(page))
            )
        except HTTPError:
            # NOTE(review): presumably the server answers with an HTTP error
            # once the page number runs past the end of the document.
            break
        else:
            print("Page", page, "saved.")
if __name__ == "__main__":
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("docin_pid", type=str, help="The number after \"p-\" in docin url")
    parser.add_argument("output_dir", type=str, help="The output directory")
    args = parser.parse_args()

    # Publish the arguments through the shared configuration object, which
    # download_image() reads the output directory from.
    conf.docin_pid = args.docin_pid
    conf.output_dir = args.output_dir

    # Make sure the target directory exists before any page is written; an
    # empty string means "current directory" and needs no creation.
    if conf.output_dir:
        os.makedirs(conf.output_dir, exist_ok=True)

    # Do the work
    download_image(conf.docin_pid)
@SyFeee
Copy link

SyFeee commented Mar 19, 2024

import os
import urllib.request
from PIL import Image

def download_images(pid, output_dir):
    """Download the pages of a Docin document and return the saved paths.

    Pages are requested one at a time, starting at page 1, and written to
    ``<output_dir>/<page number>.png``. The loop ends at the first page for
    which the request raises an HTTP error.

    Args:
        pid: The Docin document id used in the ``file`` query parameter.
        output_dir: Directory the page images are saved into; created if it
            does not exist yet.

    Returns:
        A list of the file paths that were written, in page order. The list
        is empty when the very first request fails with an HTTP error.
    """
    # Imported explicitly: the except clause needs this name, and relying on
    # "import urllib.request" to load urllib.error is an implicit side effect.
    from urllib.error import HTTPError

    if output_dir:
        # urlretrieve does not create missing directories itself.
        os.makedirs(output_dir, exist_ok=True)

    images = []
    i = 1
    while True:
        file_name = f"{i}.png"
        file_path = os.path.join(output_dir, file_name)
        url = f"http://211.147.220.164/index.jsp?file={pid}&pageno={i}"
        # Keep the try body down to the single call that can raise.
        try:
            urllib.request.urlretrieve(url, file_path)
        except HTTPError:
            print("Download completed or no more pages to fetch.")
            break
        print(f"Page {i} downloaded.")
        images.append(file_path)
        i += 1
    return images

def images_to_pdf(images, output_dir):
    """Combine image files into a single multi-page PDF.

    The PDF is written to ``<output_dir>/document.pdf`` with one page per
    image, in the order given. Nothing is written (and nothing is printed)
    when ``images`` is empty.

    Args:
        images: Paths of the image files to combine.
        output_dir: Directory the PDF is saved into.
    """
    pdf_path = os.path.join(output_dir, "document.pdf")

    image_list = []
    for image in images:
        # The context manager closes the source file as soon as the page has
        # been converted; convert() returns an independent in-memory copy.
        with Image.open(image) as source:
            image_list.append(source.convert('RGB'))

    if image_list:
        # The first image drives the save; the rest are appended as pages.
        image_list[0].save(pdf_path, save_all=True, append_images=image_list[1:])
        print(f"PDF saved at {pdf_path}")

# The dunder underscores are required: a bare `name == "main"` raises
# NameError because no variable called `name` exists.
if __name__ == "__main__":
    pid = 2064621039  # Example PID, replace with the actual PID you want to use
    output_dir = "/content/drive/MyDrive/Data"  # Change to your desired output directory

    # exist_ok=True replaces the separate "exists?" check and is safe when
    # the directory is already there.
    os.makedirs(output_dir, exist_ok=True)

    images = download_images(pid, output_dir)
    # Only build a PDF when at least one page was actually downloaded.
    if images:
        images_to_pdf(images, output_dir)

Here is the new code.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment