Skip to content

Instantly share code, notes, and snippets.

@genekogan
Created February 22, 2017 11:49
Show Gist options
  • Save genekogan/ebd77196e4bf0705db51f86431099e57 to your computer and use it in GitHub Desktop.
Save genekogan/ebd77196e4bf0705db51f86431099e57 to your computer and use it in GitHub Desktop.
scraping full size images from Google Images
from bs4 import BeautifulSoup
import requests
import re
import urllib2
import os
import argparse
import sys
import json
# adapted from http://stackoverflow.com/questions/20716842/python-download-images-from-google-image-search
def get_soup(url,header):
return BeautifulSoup(urllib2.urlopen(urllib2.Request(url,headers=header)),'html.parser')
def main(args):
parser = argparse.ArgumentParser(description='Scrape Google images')
parser.add_argument('-s', '--search', default='bananas', type=str, help='search term')
parser.add_argument('-n', '--num_images', default=10, type=int, help='num images to save')
parser.add_argument('-d', '--directory', default='/Users/gene/Downloads/', type=str, help='save directory')
args = parser.parse_args()
query = args.search#raw_input(args.search)
max_images = args.num_images
save_directory = args.directory
image_type="Action"
query= query.split()
query='+'.join(query)
url="https://www.google.co.in/search?q="+query+"&source=lnms&tbm=isch"
header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36"}
soup = get_soup(url,header)
ActualImages=[]# contains the link for Large original images, type of image
for a in soup.find_all("div",{"class":"rg_meta"}):
link , Type =json.loads(a.text)["ou"] ,json.loads(a.text)["ity"]
ActualImages.append((link,Type))
for i , (img , Type) in enumerate( ActualImages[0:max_images]):
try:
req = urllib2.Request(img, headers={'User-Agent' : header})
raw_img = urllib2.urlopen(req).read()
if len(Type)==0:
f = open(os.path.join(save_directory , "img" + "_"+ str(i)+".jpg"), 'wb')
else :
f = open(os.path.join(save_directory , "img" + "_"+ str(i)+"."+Type), 'wb')
f.write(raw_img)
f.close()
except Exception as e:
print "could not load : "+img
print e
if __name__ == '__main__':
from sys import argv
try:
main(argv)
except KeyboardInterrupt:
pass
sys.exit()
@Gucci44600
Copy link

Sorry, I tried to keep the identation of the code but it doesn't work

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment