Created
November 8, 2021 01:42
-
-
Save Leumastai/0b89d83ba7bd84388c774ee15b144f1a to your computer and use it in GitHub Desktop.
Script to scrape images from google images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Uncomment the code below if running on Google Colab | |
""" %%capture | |
import sys | |
!pip install selenium | |
#!apt-get update # to update ubuntu to correctly run apt install | |
!apt install chromium-chromedriver | |
!cp /usr/lib/chromium-browser/chromedriver /usr/bin | |
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver') """ | |
import requests | |
from bs4 import BeautifulSoup | |
import lxml | |
import re | |
import os | |
from PIL import Image | |
import sys | |
import urllib.request, urllib.parse, urllib.error | |
import ssl | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
import time | |
driver = webdriver.Chrome( | |
'/home/samuel/Downloads/Compressed/chromedriver') | |
driver.get("https://www.google.com/") | |
box = driver.find_element_by_xpath('//*[@id="sb_form_q"]') | |
#Change cranes to the object you wish to search for | |
box.send_keys("cranes") | |
box.send_keys(Keys.ENTER) | |
driver.find_element_by_xpath('//*[@id="b-scopeListItem-images"]/a').click() | |
#The line of code will keep scrolling down the webpage until it cannot scroll no more | |
last_height = driver.execute_script('return document.body.scrollHeight') | |
while True: | |
driver.execute_script('window.scrollTo(0,document.body.scrollHeight)') | |
time.sleep(2) | |
new_height = driver.execute_script('return document.body.scrollHeight') | |
try: | |
driver.find_element_by_xpath('//*[@id="islmp"]/div/div/div/div/div[5]/input').click() | |
time.sleep(2) | |
except: | |
pass | |
if new_height == last_height: | |
break | |
last_height = new_height | |
for i in range(1, 1000): | |
try: | |
driver.find_element_by_xpath('//*[@id="islrg"]/div[1]/div['+str(i)+']/a[1]/div[1]/img').screenshot('/home/samuel/images/crane/crane_'+str(i)+'.png') | |
#Change the path to save the images | |
except: | |
pass | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment