@erogol
Created September 22, 2013 10:57
Scrape Google Images for a given target query
#!/usr/bin/env python
'''
Query Google Image Search and download the resulting images by scraping.
To use this script, install the mechanize and BeautifulSoup packages:
easy_install mechanize
easy_install BeautifulSoup
Example run:
installQueriedGoogleImages('spotty')
Eren Golge erengolge@gmail.com - www.erengolge.com - 17 April 2013
'''
import json
import pdb
import urllib
import mechanize
import cookielib
import sys
import os
from BeautifulSoup import BeautifulSoup

def installQueriedGoogleImages(query):
    br = mechanize.Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but does not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)

    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    main_url = 'https://www.google.com.tr/search?q=' + query + '&safe=off&client=ubuntu&hs=Hrw&channel=fs&biw=1317&bih=678&hl=en&um=1&ie=UTF-8&tbm=isch&source=og&sa=N&tab=wi'
    r = br.open(main_url)

    counter = 1
    for i in range(5):  # walk through the first 5 result pages
        html = r.read()
        soup = BeautifulSoup(html)
        for link in br.links():
            if 'IMG' in link.text:
                r = br.open(link.url)
                html = r.read()
                soup2 = BeautifulSoup(html)  # parse the image landing page
                links = soup2.findAll('a')
                for target in links:
                    if 'Full-size image' in str(target.getString()):
                        link_img = target['href']
                        print 'Downloading image %d-%s -...\n' % (counter, link_img)
                        try:
                            #pdb.set_trace()
                            ext = link_img[-4:]
                            urllib.urlretrieve(link_img, query + '/image' + str(counter) + ext)
                        except IOError:
                            print 'image %d cannot be downloaded because of server error!...' % counter
                        except UnicodeError:
                            print 'image %d cannot be downloaded because of naming of website!...' % counter
                        counter += 1
        # Follow the 'Next' link to move on to the next result page
        spans = soup.findAll('span')
        for next in spans:
            if 'Next' in str(next.getString()):
                link = next.parent
                link = link['href']
                r = br.open(link)


if __name__ == '__main__':
    query = sys.argv[1]
    if not os.path.exists(query):
        os.makedirs(query)
    installQueriedGoogleImages(query)
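
A minimal usage sketch in Python 2, assuming the gist has been saved as google_image_scraper.py next to the calling script; the filename and the 'spotty' query are illustrative assumptions, not part of the gist:

# Hypothetical usage sketch (Python 2); assumes this gist was saved as
# google_image_scraper.py (the filename is an assumption).
import os

from google_image_scraper import installQueriedGoogleImages

query = 'spotty'
# the scraper writes files as <query>/image<N><ext>, so the directory must exist
if not os.path.exists(query):
    os.makedirs(query)
installQueriedGoogleImages(query)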