Skip to content

Instantly share code, notes, and snippets.

@randymi01
Created May 3, 2023 07:30
Show Gist options
  • Save randymi01/4ecad958c7a9d71ce29dbd611e8956fa to your computer and use it in GitHub Desktop.
Save randymi01/4ecad958c7a9d71ce29dbd611e8956fa to your computer and use it in GitHub Desktop.
Python multithread download images from site
#!/usr/bin/env python
# coding: utf-8
import requests as re
from bs4 import BeautifulSoup
import os
import time
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool
import sys
# usage:
#   py main.py relative_path_destination_folder url
folder = sys.argv[1]
link = sys.argv[2]
# NOTE: `re` is the requests module (imported as `re` above), not the
# stdlib regex module.
response = re.get(link)
# makedirs(exist_ok=True) replaces the isdir-then-mkdir pair: it is free of
# the check/create race and also creates missing parent directories.
os.makedirs(folder, exist_ok=True)
# .jpg hrefs scraped from the page, filled in by the loop below
pics = []
def image_criteria(href):
    """Filter hook for candidate image links.

    Currently a placeholder that accepts every href; add site-specific
    checks here to narrow which links are downloaded.
    """
    return True
# Parse the fetched page and collect every anchor that points at a .jpg.
soup = BeautifulSoup(response.text, "html.parser")
# Loop variable renamed from `link` so it no longer shadows the page URL
# read from sys.argv above.
for anchor in soup.find_all("a"):
    href = anchor.get("href")
    # get() returns None when an <a> tag has no href attribute; guard
    # before calling .endswith() to avoid an AttributeError.
    if href and href.endswith(".jpg") and image_criteria(href):
        pics.append(href)
# multithreaded download
# multithreaded download
def download(urls):
    """Download every URL in *urls* into the global `folder`.

    Files are named 1.jpg, 2.jpg, ... in the order the URLs appear.
    """
    # enumerate(start=1) replaces the original zip(urls, range(...)) whose
    # first element was never used.
    destinations = [folder + "/" + str(idx) + ".jpg"
                    for idx, _ in enumerate(urls, start=1)]
    download_parallel(zip(urls, destinations))
def download_url(args):
    """Fetch one (url, filename) pair and write the body to disk.

    Returns (url, elapsed_seconds) on success, or (url, None) on failure.
    The original returned bare None on failure, which made the caller's
    `result[0]` raise TypeError; always returning a tuple keeps the
    result loop safe.
    """
    t0 = time.time()
    url, fn = args[0], args[1]
    try:
        # timeout keeps a stuck server from hanging a worker thread forever
        # (`re` is the requests module).
        r = re.get(url, timeout=30)
        with open(fn, 'wb') as f:
            f.write(r.content)
        return (url, time.time() - t0)
    except Exception as e:
        print('Exception in download_url():', e)
        return (url, None)
def download_parallel(args):
    """Download (url, destination) pairs concurrently and report timings.

    A thread pool fits here because the work is I/O bound; results are
    printed as each download finishes (unordered).
    """
    # max(1, ...) guards the single-core case, where cpu_count() - 1 == 0
    # and ThreadPool(0) would raise ValueError.
    workers = max(1, cpu_count() - 1)
    # Context manager ensures the pool is terminated even if printing fails.
    with ThreadPool(workers) as pool:
        for result in pool.imap_unordered(download_url, args):
            print('url:', result[0], 'time (s):', result[1])
# Entry point guard: the scrape above runs at import time, but the actual
# downloads only start when the file is executed as a script.
if __name__ == "__main__":
download(pics)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment