Skip to content

Instantly share code, notes, and snippets.

@KokoseiJ
Last active July 8, 2022 02:08
Show Gist options
  • Save KokoseiJ/e7ce7b636183eb3c2ec27e4b44afd2f1 to your computer and use it in GitHub Desktop.
Save KokoseiJ/e7ce7b636183eb3c2ec27e4b44afd2f1 to your computer and use it in GitHub Desktop.
Batch download files from archive.org collections
import re
import sys
import requests
import threading
import subprocess
from subprocess import DEVNULL
from urllib.parse import unquote, urljoin
def download_file(url, semaphore):
    """Download *url* with ``wget`` and free one slot of *semaphore*.

    Intended to run as a thread target: the caller acquires *semaphore*
    before starting the thread, and this function releases it once the
    download attempt is over.

    Args:
        url: Absolute URL passed to ``wget`` as its only argument.
        semaphore: Semaphore-like object whose ``release()`` is called
            exactly once, whether or not the download succeeded.
    """
    try:
        # wget's progress output goes to stderr; silence it so the
        # status lines of concurrent downloads stay readable.
        result = subprocess.run(["wget", url], stderr=DEVNULL)
    except OSError as err:
        # Raised when wget is not installed or cannot be executed.
        print(f"[!] *** Could not run wget for <{unquote(url)}>: {err} ***")
    else:
        if result.returncode == 0:
            print(f"[*] *** Finished downloading <{unquote(url)}>! ***")
        else:
            print(
                f"[!] *** wget exited with status {result.returncode} "
                f"for <{unquote(url)}> ***"
            )
    finally:
        # Always hand the slot back; otherwise a single failure would
        # permanently shrink the pool and eventually stall the caller.
        semaphore.release()
# Command line: url [extension] [threads]
#   url        page whose links are scanned for downloadable files
#   extension  file extension to look for (default: "zip")
#   threads    maximum number of simultaneous downloads (default: 5)
if len(sys.argv) < 2:
    print(f"Usage: {sys.executable} {sys.argv[0]} url [extension] [threads]")
    sys.exit(1)

url = sys.argv[1]
ext = sys.argv[2] if len(sys.argv) > 2 else "zip"

try:
    threads = int(sys.argv[3]) if len(sys.argv) > 3 else 5
except ValueError:
    print(f"[!] threads must be an integer, got <{sys.argv[3]}>")
    sys.exit(1)
if threads < 1:
    # A semaphore with zero slots would block forever on the first acquire.
    print(f"[!] threads must be at least 1, got <{threads}>")
    sys.exit(1)

# Strip a trailing slash first so the displayed name is never empty.
print(f"[*] Downloading {url.rstrip('/').rsplit('/', 1)[-1]}...\n")

r = requests.get(url, timeout=30)
# Fail early on HTTP errors instead of scanning an error page for links.
r.raise_for_status()

# [^"]* keeps a match inside a single href attribute; the extension is
# escaped so it is matched literally rather than as a regex fragment.
link_pattern = re.compile(rf'href="([^"]*\.{re.escape(ext)})"')
zips = link_pattern.findall(r.text)
# Optional: uncomment to move links containing "Korea" to the front.
# zips = sorted(zips, key=lambda x: "Korea" not in x)

print(f"[*] Searching extension: <{ext}>")
print(f"[*] Amount of files to download: {len(zips)}")
print("[*] Starting download now.\n")
print("================================================\n")

# urljoin replaces the last path segment unless the base ends with "/".
baseurl = url if url.endswith("/") else f"{url}/"

# Each worker thread holds one slot; acquire() blocks the loop while all
# slots are in use and download_file() gives the slot back when finished.
semaphore = threading.BoundedSemaphore(threads)

for filename in zips:
    semaphore.acquire()
    print(f"[*] Downloading <{unquote(filename)}>...")
    file_url = urljoin(baseurl, filename)
    thread = threading.Thread(target=download_file, args=(file_url, semaphore))
    thread.start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment