Skip to content

Instantly share code, notes, and snippets.

@sborquez
Created August 15, 2019 07:08
Show Gist options
  • Save sborquez/3f0ccc93d6c4338f1b9ba3ac37a4fcee to your computer and use it in GitHub Desktop.
Save sborquez/3f0ccc93d6c4338f1b9ba3ac37a4fcee to your computer and use it in GitHub Desktop.
Create dataset from Google Images
"""
Use this script to download the images from the url.txt, then is necesary to manual prunning irrelevant images from our dataset
"""
# import the necessary packages
from imutils import paths
import argparse
import requests
import cv2
import os
# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-u", "--urls", required=True,
help="path to file containing image URLs")
ap.add_argument("-o", "--output", required=True,
help="path to output directory of images")
args = vars(ap.parse_args())
# grab the list of URLs from the input file, then initialize the
# total number of images downloaded thus far
rows = open(args["urls"]).read().strip().split("\n")
total = 0
# loop the URLs
for url in rows:
try:
# try to download the image
r = requests.get(url, timeout=60)
# save the image to disk
p = os.path.sep.join([args["output"], "{}.jpg".format(
str(total).zfill(8))])
f = open(p, "wb")
f.write(r.content)
f.close()
# update the counter
print("[INFO] downloaded: {}".format(p))
total += 1
# handle if any exceptions are thrown during the download process
except:
print("[INFO] error downloading {}...skipping".format(p))
# loop over the image paths we just downloaded
for imagePath in paths.list_images(args["output"]):
# initialize if the image should be deleted or not
delete = False
# try to load the image
try:
image = cv2.imread(imagePath)
# if the image is `None` then we could not properly load it
# from disk, so delete it
if image is None:
delete = True
# if OpenCV cannot load the image then the image is likely
# corrupt so we should delete it
except:
print("Except")
delete = True
# check to see if the image should be deleted
if delete:
print("[INFO] deleting {}".format(imagePath))
os.remove(imagePath)
/*
from https://www.pyimagesearch.com/2017/12/04/how-to-create-a-deep-learning-dataset-using-google-images/
You need to get the urls of the images to download. To do that, first search in google Images something. Then
continue scrolling down until you find that there are no relevants images. Next, must run this snippet in your
browser console. After executing the above snippet you’ll have a file named urls.txt in your default Downloads
directory.
*/
// pull down jquery into the JavaScript console
var script = document.createElement('script');
script.src = "https://ajax.googleapis.com/ajax/libs/jquery/2.2.0/jquery.min.js";
document.getElementsByTagName('head')[0].appendChild(script);
// grab the URLs
var urls = $('.rg_di .rg_meta').map(function() { return JSON.parse($(this).text()).ou; });
// write the URls to file (one per line)
var textToSave = urls.toArray().join('\n');
var hiddenElement = document.createElement('a');
hiddenElement.href = 'data:attachment/text,' + encodeURI(textToSave);
hiddenElement.target = '_blank';
hiddenElement.download = 'urls.txt';
hiddenElement.click();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment