Last active
October 13, 2015 07:55
-
-
Save ri-sh/a220ee3e90c90bd50d3e to your computer and use it in GitHub Desktop.
Download full-size images using Google image search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
import time | |
import requests | |
from PIL import Image | |
from StringIO import StringIO | |
import socket | |
from requests.exceptions import ConnectionError | |
import urllib2 | |
def go(query, path):
    """Download full size images from Google image search.

    Pages through the Google AJAX image-search API, four results per page,
    and saves every image that can be fetched as ``<title>.jpg`` inside
    ``os.path.join(path, query)`` (created if it does not exist yet).

    Don't print or republish images without permission.
    I used this to train a learning algorithm.

    Args:
        query: Search phrase. It is also used as the name of the output
            sub-directory.
        path: Directory under which the ``query`` sub-directory is created.

    Returns:
        None. Progress and per-image failures are printed to stdout; a
        failure on one image never aborts the remaining downloads.
    """
    SEARCH_URL = 'https://ajax.googleapis.com/ajax/services/search/images'
    header = {'User-Agent': 'Mozilla/5.0'}
    # Characters stripped from result titles before they are used as file
    # names (path separators plus characters Windows rejects).
    unsafe_chars = '/\\.|:,?*"<>'

    BASE_PATH = os.path.join(path, query)
    if not os.path.exists(BASE_PATH):
        os.makedirs(BASE_PATH)
    print(BASE_PATH)

    start = 0  # Google's start query string parameter for pagination.
    while start < 60:  # Google will only return a max of 56 results.
        # Let requests build the query string so that spaces, '&' and '%'
        # in the search phrase are encoded instead of corrupting the URL.
        r = requests.get(
            SEARCH_URL,
            params={'v': '1.0', 'q': query, 'start': start},
            headers=header,
        )
        payload = json.loads(r.text)
        response_data = payload.get('responseData')
        if not response_data:
            # The API answers with a null responseData when it refuses a
            # page (throttling, start out of range); nothing more to fetch.
            print('search stopped at start=%d: %s'
                  % (start, payload.get('responseDetails')))
            break

        for image_info in response_data.get('results') or []:
            # Keep percent-escapes intact; only literal spaces, which are
            # never valid inside a URL, are escaped.
            url = image_info['unescapedUrl'].replace(' ', '%20')
            print(url)
            try:
                image_bytes = urllib2.urlopen(url).read()
            except urllib2.HTTPError as e:
                print('HTTP error %s for %s' % (e.code, url))
                continue
            except urllib2.URLError as e:
                print('could not download %s: %s' % (url, e.reason))
                continue
            except Exception as e:
                # Best effort: any other failure (socket reset, bad URL,
                # ...) skips this image so stale or missing data is never
                # written to disk.
                print('could not download %s: %r' % (url, e))
                continue

            # Remove file-system path characters from name.
            title = ''.join(
                ch for ch in image_info['titleNoFormatting']
                if ch not in unsafe_chars
            ).encode('utf-8')
            target = os.path.join(BASE_PATH, title + '.jpg')
            try:
                with open(target, 'wb') as out:
                    out.write(image_bytes)
            except IOError:
                print('could not save %s' % url)

        print(start)
        start += 4  # 4 images per page.

        # Be nice to Google and they'll be nice back :)
        time.sleep(1.5)
# Example use. Guarded so that importing this module does not start a
# download as a side effect; running the file as a script still does.
if __name__ == '__main__':
    go('check', 'C:\\Pictures\\landscape')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment