Download Google Images
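Drives Chrome via Selenium to scroll a Google Images results page, collects the thumbnail URLs with BeautifulSoup, and saves each image to disk as a JPEG.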
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import sys
import os
import requests
import urllib3
from urllib3.exceptions import InsecureRequestWarning
from urllib.parse import quote_plus
import time
from tqdm import trange

# The downloads below use requests.get(..., verify=False); silence the warning.
urllib3.disable_warnings(InsecureRequestWarning)
def download_google_staticimages(searchurl, dirs, chromedriver, headless=True, max_count=1000):
    # Note: this uses the Selenium 3 style API (positional driver path,
    # find_element_by_* methods).
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    if headless:
        options.add_argument('--headless')

    try:
        browser = webdriver.Chrome(chromedriver, options=options)
    except Exception as e:
        print('Chromedriver not found in this environment.')
        print(f'Install it on your machine. Exception: {e}')
        sys.exit()
    browser.set_window_size(1280, 1024)
    browser.get(searchurl)
    time.sleep(1)

    print('Getting you a lot of images. This may take a few moments...')

    element = browser.find_element_by_tag_name('body')

    # Scroll down to trigger lazy loading of more thumbnails.
    for i in range(50):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.3)
    try:
        # Click the "show more" button (id 'smb') if present, then keep scrolling.
        browser.find_element_by_id('smb').click()
        for i in range(50):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)
    except Exception:
        for i in range(10):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)

    print('Reached end of page.')
    time.sleep(0.5)
    print('Retry')
    time.sleep(0.5)

    # The "Show more results" button label is locale-dependent; change the
    # string below to match the language of your Google results page.
    try:
        browser.find_element_by_xpath('//input[@value="Show more results"]').click()
    except Exception:
        pass
    # Scroll down again after expanding the results.
    for i in range(50):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.3)

    try:
        browser.find_element_by_id('smb').click()
        for i in range(50):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)
    except Exception:
        for i in range(10):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    images = soup.find_all('img')

    # Lazy-loaded thumbnails keep their URL in 'data-src'; the rest use 'src'.
    urls = []
    for i in trange(len(images), desc='finding images'):
        try:
            url = images[i]['data-src']
            if url.startswith('https://'):
                urls.append(url)
        except KeyError:
            try:
                url = images[i]['src']
                if url.startswith('https://'):
                    urls.append(url)
            except Exception as e:
                print('No image source found.')
                print(e)
    # Download at most max_count of the collected URLs.
    count = 0
    for i in trange(min(len(urls), max_count), desc='saving images'):
        try:
            res = requests.get(urls[i], verify=False, stream=True)
            rawdata = res.raw.read()
            with open(os.path.join(dirs, 'img_' + str(count) + '.jpg'), 'wb') as f:
                f.write(rawdata)
            count += 1
        except Exception as e:
            print('Failed to write rawdata.')
            print(e)

    browser.quit()  # quit() also shuts down the chromedriver process
    return count
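# A minimal sketch of calling the scraper directly, assuming a chromedriver
# binary at a hypothetical path and an existing output directory:
#
#   url = 'https://www.google.com/search?q=potato&source=lnms&tbm=isch'
#   saved = download_google_staticimages(url, 'pictures', '/path/to/chromedriver',
#                                        headless=True, max_count=100)
#   print(f'{saved} images saved')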
def download_images(arguments):
    """
    :param arguments: {"search_words": ["hot dog", "potato"],
                       "dir": "pictures",
                       "max_count": 1000,
                       "chrome_driver": "/usr/local/custom_bin/chromedriver",
                       "headless": True}
                      search_words, dir, and chrome_driver are required.
    :return: None
    """
    # Build the image-search URL; quote_plus encodes spaces inside each word.
    query = '+'.join(quote_plus(word) for word in arguments['search_words'])
    search_url = 'https://www.google.com/search?q=' + query + '&source=lnms&tbm=isch'

    dirs = arguments['dir']
    max_count = arguments.get('max_count', 1000)
    headless = arguments.get('headless', True)
    chrome_driver = arguments['chrome_driver']

    # makedirs handles nested paths such as "pictures/hot_dog".
    os.makedirs(dirs, exist_ok=True)
    t0 = time.time()
    count = download_google_staticimages(search_url, dirs, chrome_driver, headless, max_count)
    t1 = time.time()

    total_time = t1 - t0
    print()
    print(f'Download completed. [Successful count = {count}].')
    print(f'Total time is {total_time:.1f} seconds.')
# if __name__ == '__main__':
#     download_images({"search_words": ["Hot dog"],
#                      "dir": "pictures/hot_dog",
#                      "max_count": 1000,
#                      "chrome_driver": "/usr/local/custom_bin/chromedriver",
#                      "headless": False})