Skip to content

Instantly share code, notes, and snippets.

@Algogator
Last active April 14, 2017 12:13
Show Gist options
  • Save Algogator/fab64a14fccee6e564ef08e7b2509568 to your computer and use it in GitHub Desktop.
Multi Threaded image scraper
import json
import concurrent.futures
import requests
from PIL import Image
from io import BytesIO
import mimetypes
from urllib.parse import urlparse
from os.path import splitext
# Load the scrape manifest. Expects a top-level "res" list of
# {"title": ..., "url": ...} records — TODO confirm against the producer.
with open("res.json") as fp:
    data = json.load(fp)
def download_url(title, url):
    """Download the image at *url* and save it as ./pics/<title><ext>.

    The file extension is taken from the URL path rather than the
    Content-Type header: mimetypes.guess_extension() maps image/jpeg
    to the unusual '.jpe' (https://bugs.python.org/issue4963).

    Parameters:
        title: base filename for the saved image (no extension).
        url: image URL to fetch.

    Returns:
        The title, so the caller can report which download finished.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    response = requests.get(url)
    # Fail loudly instead of silently saving an HTML error page as an image.
    response.raise_for_status()

    parsed = urlparse(url)
    _, extension = splitext(parsed.path)

    # Round-trip through PIL so corrupt payloads raise here rather than
    # producing an unreadable file on disk.
    image = Image.open(BytesIO(response.content))
    image.save("./pics/" + title + extension)
    return title
# Fan out one download per record across a small thread pool; downloads
# are I/O-bound, so threads overlap the network waits.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [
        executor.submit(download_url, item["title"], item["url"])
        for item in data["res"]
    ]
    # as_completed() yields each future as soon as it finishes,
    # regardless of submission order.
    for future in concurrent.futures.as_completed(futures):
        try:
            # NOTE: bind to a fresh name — the original clobbered the
            # module-level `data` (the loaded JSON) here.
            title = future.result()
        except Exception as exc:  # boundary: report and keep draining the rest
            print('Exception: %s' % (exc))
        else:
            print("Finished - ", title)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment