Last active
April 14, 2017 12:13
-
-
Save Algogator/fab64a14fccee6e564ef08e7b2509568 to your computer and use it in GitHub Desktop.
Multi-threaded image scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import concurrent.futures
import json
import mimetypes
import os
from io import BytesIO
from os.path import splitext
from urllib.parse import urlparse

import requests
from PIL import Image
# Load the scrape manifest. The thread pool further down reads data["res"],
# a list of entries that each carry a "title" and a "url" key.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default locale encoding.
with open("res.json", encoding="utf-8") as f:
    data = json.load(f)
def download_url(title, url, timeout=30):
    """Download the image at ``url`` and save it under ``./pics/``.

    The saved file is named ``title`` plus the extension found in the URL
    path. The extension comes from the URL rather than from
    ``mimetypes.guess_extension`` because the latter can return ``.jpe``
    for JPEG content (https://bugs.python.org/issue4963). When the URL path
    has no extension, the format Pillow detected in the payload is used.

    Args:
        title: Base name for the saved file. Path separators are replaced
            with ``_`` so the file always lands directly in ``./pics/``.
        url: Address of the image to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up.

    Returns:
        ``title``, unchanged, so the caller can report which download
        finished.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response.
        OSError: If the payload cannot be decoded as an image or the file
            cannot be written.
        ValueError: If Pillow cannot map the extension to an image format.
    """
    # Without a timeout a stalled server would block this worker forever.
    response = requests.get(url, timeout=timeout)
    # Fail here on HTTP errors instead of handing an error page to Pillow.
    response.raise_for_status()

    _, extension = splitext(urlparse(url).path)

    image = Image.open(BytesIO(response.content))
    if not extension and image.format:
        # Extension-less URL: fall back to the decoded format, e.g. ".png".
        extension = "." + image.format.lower()

    os.makedirs("./pics", exist_ok=True)
    # Keep the output inside ./pics even if the title contains a separator.
    safe_title = title.replace("/", "_").replace(os.sep, "_")
    image.save("./pics/" + safe_title + extension)
    return title
# Fan the downloads out over a small thread pool; the work is I/O-bound.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Map every future back to its manifest entry so that a failure can be
    # reported against the URL that caused it.
    future_to_item = {
        executor.submit(download_url, item["title"], item["url"]): item
        for item in data["res"]
    }
    # as_completed yields futures as they finish, not in submission order.
    for future in concurrent.futures.as_completed(future_to_item):
        item = future_to_item[future]
        try:
            # Use a dedicated name: rebinding ``data`` here would clobber
            # the loaded manifest.
            title = future.result()
        except Exception as exc:
            # Top-level boundary: one failed download must not stop the rest.
            print('Exception: %s (%s)' % (exc, item["url"]))
        else:
            print("Finished - ", title)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment