Created
May 12, 2018 00:12
-
-
Save benaneesh/4ffb5cc45922c1545ae3853486213d30 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ian London 2016
# Couldn't get anything I found to work, so I wrote it myself.
# This downloads thumbnails from Google Images.
import base64
import glob
import os
import re
import time
import urllib

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
# Search term; the output folder name is derived from it.
QUERY = 'panda'
DOWNLOAD_DIR = '%s_rip' % (QUERY,)

# XPath matching the thumbnail <img> elements on the results page.
image_xpath = "//img[@class='rg_i']"

# Google Images search URL with QUERY substituted into the as_q parameter.
# NOTE(review): QUERY is inserted verbatim (no URL quoting) -- fine for a
# single word such as 'panda'.
target_url_str = "https://www.google.com/search?as_st=y&tbm=isch&hl=en&as_q=%s&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs=isz:m" % (QUERY,)

# Start Firefox under Selenium control and load the results page.  This runs
# at module level, so the browser opens as soon as the file is executed.
driver = webdriver.Firefox()
driver.get(target_url_str)
# Handles dynamic page content loading using Selenium.
# Modified from http://sqa.stackexchange.com/questions/3499/how-to-scroll-to-bottom-of-page-in-selenium-ide
# (thanks Polyakoff)
def scroll_down(pause=8):
    """Scroll the results page to the bottom until it stops growing.

    After every scroll the function waits ``pause`` seconds, tries to click
    the element with id ``smb`` (the "See More" button), and then compares
    the document height with the previous one.  It returns once the height
    is unchanged between two consecutive passes.

    Args:
        pause: Seconds to wait after each scroll so that newly requested
            content has time to load.  Defaults to 8, the value that was
            previously hard-coded.
    """
    # Initial page height, used as the baseline for the loop below.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        try:
            driver.find_element_by_id('smb').click()
        except Exception:
            # Best effort: the button is often absent or not clickable.
            # Catch Exception rather than using a bare except so that
            # Ctrl-C (KeyboardInterrupt) and SystemExit still stop the run.
            print('no See More button found...')
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
def hover(el):
    """Move the mouse pointer over the element ``el``."""
    pointer = ActionChains(driver)
    pointer.move_to_element(el)
    pointer.perform()
def right_click_save_as(el):
    """Move to ``el``, open its context menu and send the key 'V'.

    NOTE(review): presumably 'V' is the context-menu shortcut for saving the
    image in Firefox -- confirm against the browser version in use.
    """
    actions = ActionChains(driver)
    actions.move_to_element(el)
    actions.context_click(el)
    actions.send_keys('V')
    actions.perform()
def save_img_src(el, file_no, sleep_time=0.25):
    """Download the resource named by ``el``'s ``src`` attribute.

    The file is saved as ``<DOWNLOAD_DIR>/<file_no>.JPEG``.  An IOError
    raised by the download is reported on stdout instead of propagating.

    Args:
        el: Element whose ``src`` attribute is fetched.
        file_no: Value used as the base name of the output file.
        sleep_time: Seconds to sleep before returning.
    """
    src_url = el.get_attribute('src')
    # The URL probably carries no file extension, so just guess JPEG.
    dest_path = '%s/%s.%s' % (DOWNLOAD_DIR, file_no, 'JPEG')
    try:
        urllib.urlretrieve(src_url, dest_path)
        print('wrote from url %s' % dest_path)
    except IOError as err:
        print('Bad URL?' + ' ' + str(err))
    time.sleep(sleep_time)
# Google image thumbnails are base64 data URIs embedded in the HTML...
def dl_base64_img(el, file_no, sleep_time=0.25):
    """Save the image held in ``el``'s ``src`` attribute to DOWNLOAD_DIR.

    The element is hovered first, then its ``src`` is read.  When ``src`` is
    a ``data:image/<type>;...,<payload>`` URI the payload is base64-decoded
    and written to ``<DOWNLOAD_DIR>/<file_no>.<type>``.  Any other non-empty
    ``src`` is handed to ``save_img_src`` to be fetched as a URL.

    Args:
        el: The <img> element to save.
        file_no: Value used as the base name of the output file.
        sleep_time: Seconds to sleep after a successful write.
    """
    hover(el)
    time.sleep(0.25)
    base = el.get_attribute('src')
    if not base:
        print('no img %s' % file_no)
        return

    # Split "data:image/jpeg;base64" from the payload at the FIRST comma, so
    # the comma itself is never part of the data handed to the decoder.
    header, _, payload = base.partition(',')
    # Only look at the header and stop at the first ';' so that extra
    # parameters (or an ordinary URL that happens to contain "image/")
    # cannot end up in the file extension.
    found = re.findall(r'^data:image/([^;,]+);', header)
    if not found:
        print('no img filetype... trying to save src %s' % file_no)
        save_img_src(el, file_no)
        return
    base_filetype = found[0]

    # Decode before opening the file so a bad payload leaves no empty file.
    img_bytes = base64.b64decode(payload)
    file_name_full = '%s/%s.%s' % (DOWNLOAD_DIR, file_no, base_filetype)
    # Binary mode: text mode would translate newline bytes on some platforms
    # and corrupt the image.
    with open(file_name_full, 'wb') as f:
        f.write(img_bytes)
    print('wrote %s' % file_name_full)
    time.sleep(sleep_time)
if __name__ == "__main__":
    # TODO: use command line args instead of hard-coded vars (eg for query)

    # Stop starting new passes once this many images have been handled.
    max_images = 1000

    # The download helpers write into DOWNLOAD_DIR but never create it.
    if not os.path.isdir(DOWNLOAD_DIR):
        os.makedirs(DOWNLOAD_DIR)

    prev_file_no = 0  # on an aborted scrape, set this to the last file written + 1

    try:
        # The first image_xpath lookup returns only part of the results, so
        # keep scrolling and re-querying until a pass finds nothing new.
        # The element list is refreshed BEFORE the "anything new?" check;
        # comparing against the list from the previous pass would always end
        # the loop after a single iteration.
        while prev_file_no < max_images:
            # scroll down to load many images
            scroll_down()
            imgs = driver.find_elements_by_xpath(image_xpath)
            print('new loop. found %i images, prev_file_no was %i' % (len(imgs), prev_file_no))
            if len(imgs) <= prev_file_no:
                break
            # Number files by absolute position so a resumed scrape does not
            # overwrite files written earlier.
            for file_no, img_el in enumerate(imgs[prev_file_no:], prev_file_no):
                dl_base64_img(img_el, file_no)
            prev_file_no = len(imgs)
    finally:
        # Always close the browser, even when the scrape is interrupted.
        driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment