Skip to content

Instantly share code, notes, and snippets.

@stivens13
Last active August 16, 2022 16:08
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stivens13/5fc95ea2585fdfa3897f45a2d478b06f to your computer and use it in GitHub Desktop.
Save stivens13/5fc95ea2585fdfa3897f45a2d478b06f to your computer and use it in GitHub Desktop.
Bing image search by Adrian Rosebrock @jrosebr1 from https://www.pyimagesearch.com/2018/04/09/how-to-quickly-build-a-deep-learning-image-dataset/ with gevent implementation for faster image capturing
import os
from urllib.parse import urlsplit

import cv2
import gevent
import requests
from requests import exceptions
# ---------------------------------------------------------------------------
# Script configuration and the initial Bing search.
#
# This section defines the search term, the output directory, the API
# settings and the exception types treated as "skip this image", then
# issues one search request to learn how many results are available.
# ---------------------------------------------------------------------------

# poke name to download
pokemon = 'mewtwo'
output = 'dataset/mewtwo'
API_KEY = "TYPE YOUR KEY HERE"
MAX_RESULTS = 250
GROUP_SIZE = 50

# set the endpoint API URL
URL = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"

# when attempting to download images from the web both the Python
# programming language and the requests library have a number of
# exceptions that can be thrown, so collect them here so we can filter
# on them
EXCEPTIONS = {IOError, FileNotFoundError, exceptions.RequestException, exceptions.HTTPError, exceptions.ConnectionError,
              exceptions.Timeout}

# make sure the output directory exists before any download starts;
# otherwise every write in `grab_page` fails with FileNotFoundError,
# which is one of EXCEPTIONS and is therefore reported as a mere "skip"
os.makedirs(output, exist_ok=True)

# store the search term in a convenience variable then set the
# headers and search parameters
term = pokemon
headers = {"Ocp-Apim-Subscription-Key": API_KEY}
params = {"q": term, "offset": 0, "count": GROUP_SIZE}

# make the search; requests has no default timeout, so pass one
# explicitly to avoid hanging forever on an unresponsive endpoint
print("[INFO] searching Bing API for '{}'".format(term))
search = requests.get(URL, headers=headers, params=params, timeout=30)
search.raise_for_status()

# grab the results from the search, including the total number of
# estimated results returned by the Bing API (capped at MAX_RESULTS)
results = search.json()
estNumResults = min(results["totalEstimatedMatches"], MAX_RESULTS)
print("[INFO] {} total results for '{}'".format(estNumResults, term))

# initialize the total number of images downloaded thus far
total = 0
def grab_page(url, ext, total):
    """Download one image into ``output`` and drop it if OpenCV cannot read it.

    The image is fetched from ``url``, written to
    ``<output>/<total zero-padded to 8 digits><ext>`` and then loaded with
    ``cv2.imread``. If the load yields ``None`` the file is deleted again.

    Args:
        url: address of the image to fetch.
        ext: file-name extension (including the leading dot) appended to
            the generated file name.
        total: sequence number of the image; used only to build the
            file name.

    Returns:
        None. Failures are reported on stdout and never propagate, so one
        bad image cannot abort the rest of the batch.
    """
    # build the path to the output image
    # (here `total` is only used for file name creation)
    path = os.path.sep.join([output, "{}{}".format(str(total).zfill(8), ext)])

    try:
        print("[INFO] fetching: {}".format(url))
        response = requests.get(url, timeout=30)
        # treat HTTP error statuses as a failed download instead of
        # writing the error page to disk
        response.raise_for_status()

        # write the image to disk; `with` closes the file even if the
        # write itself fails
        with open(path, "wb") as f:
            f.write(response.content)

        # try to load the image from disk; if the image is `None` then
        # we could not properly load it (so it should be ignored)
        image = cv2.imread(path)
        if image is None:
            print("[INFO] deleting: {}".format(path))
            os.remove(path)

    # catch the errors we expect while downloading or saving an image;
    # matching via `except` also covers subclasses of the listed types
    # (an exact `type(e) in EXCEPTIONS` test would miss them)
    except tuple(EXCEPTIONS):
        print("[INFO] skipping: {}".format(url))

    # job boundary: anything else is still not allowed to escape, but it
    # is reported instead of being swallowed silently
    except Exception as e:
        print("[WARN] unexpected error for {}: {!r}".format(url, e))
# loop over the estimated number of results in `GROUP_SIZE` groups,
# spawning one gevent job per image in each group
#
# NOTE(review): gevent only overlaps the downloads if the blocking
# socket calls made by `requests` are cooperative, i.e. if
# `gevent.monkey.patch_all()` ran before `requests` was imported. No such
# call is visible in this file -- confirm; without it the jobs presumably
# run one after another.
for offset in range(0, estNumResults, GROUP_SIZE):
    # update the search parameters using the current offset, then
    # make the request to fetch the results
    print("[INFO] making request for group {}-{} of {}...".format(
        offset, offset + GROUP_SIZE, estNumResults))
    params["offset"] = offset
    # explicit timeout: requests would otherwise wait indefinitely
    search = requests.get(URL, headers=headers, params=params, timeout=30)
    search.raise_for_status()
    results = search.json()
    print("[INFO] saving images for group {}-{} of {}...".format(
        offset, offset + GROUP_SIZE, estNumResults))

    # loop over the results; a response without a "value" entry is
    # treated as an empty group rather than raising KeyError
    jobs = []
    for v in results.get("value", []):
        total += 1
        url = v["contentUrl"]
        # take the extension from the URL *path* only, so that query
        # strings ("img.jpg?w=500") or a dot in the host name
        # ("example.com/img/123") never leak into the file name
        ext = os.path.splitext(urlsplit(url).path)[1]
        # create gevent job
        jobs.append(gevent.spawn(grab_page, url, ext, total))

    # wait (up to 10 seconds) for this group's jobs to complete
    gevent.joinall(jobs, timeout=10)

# number of images attempted (not necessarily kept on disk)
print(total)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment