Skip to content

Instantly share code, notes, and snippets.

@ri-sh
Last active October 13, 2015 07:55
Show Gist options
  • Save ri-sh/a220ee3e90c90bd50d3e to your computer and use it in GitHub Desktop.
Save ri-sh/a220ee3e90c90bd50d3e to your computer and use it in GitHub Desktop.
Download full sized images using google search
import json
import os
import time
import requests
from PIL import Image
from StringIO import StringIO
import socket
from requests.exceptions import ConnectionError
import urllib2
def _safe_filename(title):
    """Return *title* with characters that are unsafe in file names removed.

    Falls back to ``'untitled'`` when nothing is left after stripping.
    """
    # Path separators, the extension dot, and characters Windows rejects.
    for ch in u'/\\.|:,?*"<>':
        title = title.replace(ch, u'')
    return title or u'untitled'


def go(query, path):
    """Download full size images from Google image search.

    Don't print or republish images without permission.
    I used this to train a learning algorithm.

    Args:
        query: Search term. Also used as the name of the sub-directory
            (created under ``path`` if missing) that receives the images.
        path: Directory under which the ``query`` sub-directory is created.

    Each result is saved as ``<sanitized title>.jpg``. Results that cannot
    be downloaded or saved are reported on stdout and skipped. Errors from
    the search request itself (``requests.get``) are not caught here.
    """
    # The query string is passed through ``params`` so that requests
    # URL-encodes ``query`` (spaces, '&', '%', non-ASCII text, ...).
    search_url = 'https://ajax.googleapis.com/ajax/services/search/images'
    BASE_PATH = os.path.join(path, query)
    if not os.path.exists(BASE_PATH):
        os.makedirs(BASE_PATH)
    print(BASE_PATH)

    start = 0  # Google's start query string parameter for pagination.
    while start < 60:  # Google will only return a max of 56 results.
        r = requests.get(
            search_url,
            params={'v': '1.0', 'q': query, 'start': start},
        )
        response_data = json.loads(r.text).get('responseData')
        if not response_data:
            # NOTE(review): presumably the API sends a null responseData
            # when it refuses a request (rate limit / out of range) --
            # confirm. Later pages are unlikely to succeed, so stop here.
            print('no results returned for start=%d, stopping' % start)
            break

        for image_info in response_data.get('results') or []:
            # NOTE(review): stripping "%20" is kept from the original
            # implementation; confirm it is intended, since it alters URLs
            # that contain encoded spaces.
            url = image_info['unescapedUrl'].replace("%20", '')
            print(url)

            # Every failure path must skip this result; otherwise the
            # bytes of a previous image (or an undefined name) would be
            # written below.
            try:
                image_r = urllib2.urlopen(url).read()
            except urllib2.HTTPError as e:
                print('HTTP error %s, could not download %s' % (e.code, url))
                continue
            except urllib2.URLError as e:
                print('URL error (%s), could not download %s'
                      % (e.reason, url))
                continue
            except Exception as e:
                # Best-effort scraper: report anything else and move on,
                # but let KeyboardInterrupt/SystemExit propagate.
                print('could not download %s: %r' % (url, e))
                continue

            # Remove file-system path characters from name.
            name = _safe_filename(
                image_info['titleNoFormatting']).encode('utf-8') + '.jpg'
            target = os.path.join(BASE_PATH, name)
            try:
                # ``with`` closes the file even when the write fails, and
                # a failing open() is now reported instead of crashing.
                with open(target, 'wb') as out:
                    out.write(image_r)
            except IOError:
                # Throw away some gifs...blegh.
                print('could not save %s' % url)
                continue

        print(start)
        start += 4  # 4 images per page.
        # Be nice to Google and they'll be nice back :)
        time.sleep(1.5)
# Example use: search for 'check' and store the downloaded images in a
# 'check' sub-directory of C:\Pictures\landscape (Windows-style path).
# NOTE: this call runs at import time, not only when executed as a script.
go('check', 'C:\\Pictures\\landscape')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment