@genekogan
Created February 22, 2017 11:49
scraping full size images from Google Images
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os
import argparse
import sys
import json

# adapted from http://stackoverflow.com/questions/20716842/python-download-images-from-google-image-search


def get_soup(url, header):
    return BeautifulSoup(urllib2.urlopen(urllib2.Request(url, headers=header)), 'html.parser')


def main(args):
    parser = argparse.ArgumentParser(description='Scrape Google images')
    parser.add_argument('-s', '--search', default='bananas', type=str, help='search term')
    parser.add_argument('-n', '--num_images', default=10, type=int, help='num images to save')
    parser.add_argument('-d', '--directory', default='/Users/gene/Downloads/', type=str, help='save directory')
    args = parser.parse_args()

    query = args.search
    max_images = args.num_images
    save_directory = args.directory
    query = '+'.join(query.split())
    url = "https://www.google.co.in/search?q=" + query + "&source=lnms&tbm=isch"
    header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
    soup = get_soup(url, header)

    # each div.rg_meta holds JSON metadata: "ou" is the link to the large
    # original image, "ity" is the image file type
    ActualImages = []
    for a in soup.find_all("div", {"class": "rg_meta"}):
        meta = json.loads(a.text)
        ActualImages.append((meta["ou"], meta["ity"]))

    for i, (img, Type) in enumerate(ActualImages[0:max_images]):
        try:
            # pass the header dict itself, not a dict nested under 'User-Agent'
            req = urllib2.Request(img, headers=header)
            raw_img = urllib2.urlopen(req).read()
            ext = Type if len(Type) > 0 else "jpg"
            f = open(os.path.join(save_directory, "img_" + str(i) + "." + ext), 'wb')
            f.write(raw_img)
            f.close()
        except Exception as e:
            print "could not load : " + img
            print e


if __name__ == '__main__':
    try:
        main(sys.argv)
    except KeyboardInterrupt:
        pass
    sys.exit()
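
Note that the script above is Python 2 only (urllib2 and the print statements). For Python 3, a minimal sketch of the same fetch-and-save steps, assuming nothing beyond bs4 and the standard library (urllib2's Request/urlopen now live in urllib.request):

# Python 3 version of get_soup and the per-image download;
# the rg_meta parsing loop stays exactly as above.
import urllib.request

from bs4 import BeautifulSoup

HEADER = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}


def get_soup(url, header):
    req = urllib.request.Request(url, headers=header)
    return BeautifulSoup(urllib.request.urlopen(req), 'html.parser')


def save_image(link, path, header=HEADER):
    # download one image URL to the given file path
    req = urllib.request.Request(link, headers=header)
    with open(path, 'wb') as f:
        f.write(urllib.request.urlopen(req).read())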
@Gucci44600

Sorry, I tried to keep the indentation of the code but it doesn't work

@poa00

poa00 commented May 23, 2024

Sorry, I tried to keep the indentation of the code but it doesn't work

That's because it looks like you used only one backtick above and below instead of three. Also, a tip: after the first three backticks, name the language so that you get syntax highlighting, like so:

```python

<your code here>

```

Your code would then look like this (plus my best guess at the correct indentation):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time
import requests
import shutil
import os
import argparse


def save_img(inp, img, i, directory):
    try:
        filename = inp + str(i) + '.jpg'
        response = requests.get(img, stream=True)
        image_path = os.path.join(directory, filename)
        with open(image_path, 'wb') as file:
            shutil.copyfileobj(response.raw, file)
    except Exception:
        pass


def find_urls(inp, url, driver, directory):
    driver.get(url)
    img_urls = driver.find_elements(By.XPATH, '//img[contains(@class,"rg_i yDmH0d")]')
    # img_urls = driver.find_elements(By.XPATH, '//*[@id="yDmH0d"]')
    for _ in range(500):
        driver.execute_script("window.scrollBy(0,10000)")
        try:
            # "Show more results" button; find_element_by_css_selector is
            # gone in Selenium 4, so use find_element with By
            driver.find_element(By.CSS_SELECTOR, '.mye4qd').click()
        except Exception:
            continue
    for j, imgurl in enumerate(img_urls):
        try:
            img = imgurl.get_attribute('src')
            save_img(inp, img, j, directory)
            time.sleep(1.5)
        except Exception:
            pass


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Scrape Google images')
    parser.add_argument('-s', '--search', default='banana', type=str, help='search term')
    parser.add_argument('-d', '--directory', default='../Downloads/', type=str, help='save directory')
    args = parser.parse_args()
    s = Service(r'C:\chromedriver.exe')  # raw string so the backslash survives
    driver = webdriver.Chrome(service=s)
    directory = args.directory
    inp = args.search
    if not os.path.isdir(directory):
        os.makedirs(directory)
    url = ('https://www.google.com/search?q=french+pedestrian+light'
           '&rlz=1C1CHBF_frFR1008FR1008&source=lnms&tbm=isch'
           '&sa=X&ved=2ahUKEwiGgui61IX9AhVaQaQEHeS4CkgQ_AUoAXoECAEQAw'
           '&biw=1536&bih=714&dpr=1.25')
    find_urls(inp, url, driver, directory)
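
For what it's worth, a hypothetical direct call to these functions, assuming Selenium 4.6+ (whose Selenium Manager can locate a matching chromedriver on its own, so the hard-coded C:\chromedriver.exe path isn't needed):

# hypothetical usage; search term, URL, and save directory are just examples
from selenium import webdriver

driver = webdriver.Chrome()  # Selenium Manager finds a driver automatically
find_urls('banana', 'https://www.google.com/search?q=banana&tbm=isch', driver, '../Downloads/')
driver.quit()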
