Skip to content

Instantly share code, notes, and snippets.

@RickyAvina
Created May 28, 2020 19:14
Show Gist options
  • Save RickyAvina/ceb7964691061475988f7ffd47bb9ac6 to your computer and use it in GitHub Desktop.
Save RickyAvina/ceb7964691061475988f7ffd47bb9ac6 to your computer and use it in GitHub Desktop.
Download Google Images
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import sys
import os
import requests
import urllib3
from urllib3.exceptions import InsecureRequestWarning
import time
from tqdm import trange
# Silence urllib3's InsecureRequestWarning, which would otherwise be emitted for
# every image request this module makes with verify=False.
urllib3.disable_warnings(InsecureRequestWarning)
def _scroll_down(element, presses, pause=0.3):
    """Send PAGE_DOWN to *element* ``presses`` times, sleeping ``pause`` seconds after each."""
    for _ in range(presses):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(pause)


def _scroll_results(browser, element):
    """Scroll the result page, clicking the element with id ``smb`` when it can be found."""
    _scroll_down(element, 50)
    try:
        browser.find_element_by_id('smb').click()
        _scroll_down(element, 50)
    except Exception:
        # Button missing (or scrolling failed): do a short extra scroll instead.
        # ``Exception`` rather than a bare ``except`` so Ctrl-C still interrupts.
        _scroll_down(element, 10)


def _collect_image_urls(page_source):
    """Return the ``https://`` URLs found on the ``<img>`` tags of *page_source*.

    ``data-src`` is preferred; ``src`` is consulted only when ``data-src`` is absent.
    """
    soup = BeautifulSoup(page_source, 'lxml')
    images = soup.find_all('img')
    urls = []
    for i in trange(len(images), desc='finding images'):
        image = images[i]
        url = image.get('data-src')
        if url is None:
            url = image.get('src')
        if url is None:
            print('No found image sources.')
            continue
        if url.startswith('https://'):
            urls.append(url)
    return urls


def _save_images(urls, dirs, timeout=30):
    """Download every URL into *dirs* as ``img_<n>.jpg`` and return how many were saved."""
    count = 0
    if not urls:
        return count
    for i in trange(len(urls), desc='saving images'):
        try:
            # NOTE(review): certificate verification is disabled, as in the
            # original code; confirm this is still wanted.
            res = requests.get(urls[i], verify=False, timeout=timeout)
            # Do not store HTTP error pages as if they were images.
            res.raise_for_status()
            with open(os.path.join(dirs, 'img_' + str(count) + '.jpg'), 'wb') as f:
                # ``content`` is the content-decoded body, unlike ``raw.read()``.
                f.write(res.content)
            count += 1
        except Exception as e:
            print('Failed to write rawdata.')
            print(e)
    return count


def download_google_staticimages(searchurl, dirs, chromedriver, headless=True):
    """Open *searchurl* in Chrome, scroll to load thumbnails, and save the images found.

    :param searchurl: URL of the image-search result page to load.
    :param dirs: directory the images are written to; it must already exist.
    :param chromedriver: value passed to ``webdriver.Chrome`` as the driver location.
    :param headless: when true, Chrome is started with ``--headless``.
    :return: number of images successfully written to *dirs*.

    If the browser cannot be started, a message is printed and the process
    exits via ``sys.exit()``.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    if headless:
        options.add_argument('--headless')

    # NOTE(review): the positional driver path and the ``find_element_by_*``
    # helpers are the Selenium 3 style API; confirm the pinned Selenium version.
    try:
        browser = webdriver.Chrome(chromedriver, options=options)
    except Exception as e:
        print('No found chromedriver in this environment.')
        print(f'Install on your machine. exception: {e}')
        sys.exit()

    try:
        browser.set_window_size(1280, 1024)
        browser.get(searchurl)
        time.sleep(1)

        print('Getting you a lot of images. This may take a few moments...')
        element = browser.find_element_by_tag_name('body')

        _scroll_results(browser, element)

        print('Reached end of page.')
        time.sleep(0.5)
        print('Retry')
        time.sleep(0.5)

        # The button is matched by its visible label; change the text below to
        # your browser language if required. A missing button is not fatal:
        # carry on with whatever has been loaded so far.
        try:
            browser.find_element_by_xpath('//input[@value="Show more results"]').click()
        except Exception as e:
            print('Could not click "Show more results"; continuing with loaded images.')
            print(e)

        _scroll_results(browser, element)

        page_source = browser.page_source
    finally:
        # Always release the browser, including when a step above raised.
        browser.quit()

    urls = _collect_image_urls(page_source)
    return _save_images(urls, dirs)
def download_images(arguments):
    """Build a Google Images search URL from the given words and download the results.

    :param arguments: dict with the following keys::

            {"search_words": ["hot dog", "potato"],   # required
             "dir": "pictures",                       # required, created if missing
             "chrome_driver": "/usr/local/custom_bin/chromedriver",  # required
             "max_count": 1000,                       # optional, see below
             "headless": True}                        # optional, defaults to True

        ``max_count`` is accepted for compatibility but is not enforced: this
        function does not pass it to the downloader.
    :return: None
    """
    from urllib.parse import quote_plus

    # URL-encode each word so spaces and characters such as '&' or '#' cannot
    # corrupt the query string; words are joined with '+', as before.
    query = '+'.join(quote_plus(word) for word in arguments['search_words'])
    search_url = 'https://www.google.com/search?q=' + query + '&source=lnms&tbm=isch'

    dirs = arguments['dir']
    headless = arguments.get('headless', True)
    chrome_driver = arguments['chrome_driver']

    # makedirs (not mkdir) so nested targets such as "pictures/hot_dog" work.
    os.makedirs(dirs, exist_ok=True)

    t0 = time.time()
    count = download_google_staticimages(search_url, dirs, chrome_driver, headless)
    total_time = time.time() - t0

    print('\n')
    print(f'Download completed. [Successful count = {count}].')
    print(f'Total time is {str(total_time)} seconds.')
# if __name__ == '__main__':
# download_images({"search_words": ["Hot dog"],
# "dir": "pictures/hot_dog",
# "max_count": 1000,
# "chrome_driver": "/usr/local/custom_bin/chromedriver",
# "headless": False})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment