Skip to content

Instantly share code, notes, and snippets.

@usunyu
Created August 13, 2016 06:47
Show Gist options
  • Save usunyu/9a4d56269b2bdbc88f5a1410d2ec7cf9 to your computer and use it in GitHub Desktop.
Save usunyu/9a4d56269b2bdbc88f5a1410d2ec7cf9 to your computer and use it in GitHub Desktop.
Search for and download content from the web
import json
import os
import time
import requests
from PIL import Image
from StringIO import StringIO
from requests.exceptions import ConnectionError
def _save_as_jpeg(content, target):
    """Decode raw image bytes and write them to ``target`` as a JPEG.

    Returns True on success. Returns False when the bytes cannot be decoded
    or the file cannot be written as JPEG; a partially written ``target`` is
    removed in that case.
    """
    # Local import: downloaded image data is binary, so it needs a bytes
    # buffer rather than a text StringIO.
    from io import BytesIO

    try:
        image = Image.open(BytesIO(content))
    except IOError:
        # Nothing has been written yet, so there is nothing to clean up.
        return False

    try:
        # 'wb' -- JPEG data is binary; text mode can corrupt it.
        with open(target, 'wb') as out:
            image.save(out, 'JPEG')
    except IOError:
        # The `with` block has already closed the handle, so removing the
        # partial file is safe on every platform.
        if os.path.exists(target):
            os.remove(target)
        return False
    return True


def go(query, folder, path):
    """Download full-size images from Google image search.

    Pages through the Google AJAX image-search endpoint and saves every
    result that can be written as JPEG to ``<path>/<folder>/<title>.jpg``.
    Images that cannot be downloaded or saved are reported on stdout and
    skipped.

    Don't print or republish images without permission.
    I used this to train a learning algorithm.

    Args:
        query: Search terms. Passed as a request parameter so that it is
            URL-encoded instead of being spliced into the URL verbatim.
        folder: Sub-directory of ``path`` that receives the images; it is
            created when it does not exist.
        path: Parent directory of ``folder``.

    Raises:
        ValueError: If a search response body is not valid JSON.
    """
    search_url = 'https://ajax.googleapis.com/ajax/services/search/images'
    base_path = os.path.join(path, folder)
    if not os.path.exists(base_path):
        os.makedirs(base_path)

    # `start` is Google's pagination parameter: 4 images per page, and
    # Google will only return a max of 56 results.
    for start in range(0, 60, 4):
        r = requests.get(
            search_url,
            params={'v': '1.0', 'q': query, 'start': start},
        )
        # A reply with a missing or null 'responseData' carries no results;
        # stop paging instead of failing with a TypeError.
        response_data = json.loads(r.text).get('responseData') or {}
        results = response_data.get('results') or []
        if not results:
            break

        for image_info in results:
            url = image_info['unescapedUrl']
            try:
                image_r = requests.get(url)
            except ConnectionError:
                print('could not download %s' % url)
                continue

            # Remove file-system path characters from name.
            title = image_info['titleNoFormatting']
            title = title.replace('/', '').replace('\\', '')
            # Format the file name before joining so that a '%' in the
            # directory part cannot break the string formatting.
            target = os.path.join(base_path, '%s.jpg' % title)
            if not _save_as_jpeg(image_r.content, target):
                # Throw away some gifs...blegh.
                print('could not save %s' % url)

        print(start)
        # Be nice to Google and they'll be nice back :)
        time.sleep(1.5)
# Example: search for USC campus photos and store them under campus/usc.
go(
    query='University of Southern California Campus',
    folder='usc',
    path='campus',
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment