erogol/scrap_bing.py

## scrap_bing.py
#!/usr/bin/env python


'''
Query Bing Image Search and download the resulting images by scraping.

To use this script, install the mechanize and BeautifulSoup packages with
easy_install mechanize
easy_install BeautifulSoup

Example Run:
installQueriedGoogleImages('spotty')

Eren Golge erengolge@gmail.com - www.erengolge.com - 17 April 2013
'''

import json
import pdb
import urllib
import mechanize
import cookielib
import re
import sys
import os
from BeautifulSoup import BeautifulSoup

def installQueriedGoogleImages(query):
    """Scrape Bing image search for ``query`` and download the results.

    Despite its name, this function queries Bing
    (``http://www.bing.com/images/search``). Each result tile
    (``<div class="dg_u">``) of the fetched results page is inspected, the
    original image URL is taken from the ``imgurl:"..."`` field of the ``m``
    attribute of the tile's first anchor, and the image is saved as
    ``<query>/image<N><ext>`` via ``download_photo``.

    The directory named ``query`` is expected to exist already; this
    function does not create it.

    :param query: search phrase; also used as the output directory name.
    """
    br = mechanize.Browser()
    br.set_cookiejar(cookielib.LWPCookieJar())

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    # Follow "refresh 0" redirects but do not hang on refresh > 0.
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Present a regular desktop browser User-Agent to the server.
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    # Escape the query so spaces and reserved characters ('&', '#', ...)
    # cannot corrupt the URL.
    main_url = 'http://www.bing.com/images/search?q=' + urllib.quote_plus(query)

    # Only this single results page is fetched; pagination is not implemented.
    html = br.open(main_url).read()
    soup = BeautifulSoup(html)

    counter = 1
    for div in soup.findAll('div', {'class': "dg_u"}):
        links = div.findAll('a')
        if not links:
            # Tile without an anchor: nothing to download.
            continue
        # The anchor's "m" attribute carries the tile metadata, including
        # the original image URL.
        match = re.search('imgurl:"([^"]+)', links[0].get('m') or '')
        if match is None:
            continue
        img_link = match.group(1)
        print('Downloading image %d-%s -...\n' % (counter, img_link))

        failure = None
        try:
            target = query + '/image' + str(counter) + _image_extension(img_link)
            # download_photo reports failure through its return value.
            if not download_photo(img_link, target):
                failure = 'server error'
        except IOError:
            failure = 'server error'
        except UnicodeError:
            # Mixing byte and unicode strings in the target path can fail.
            failure = 'naming of website'
        if failure is not None:
            print('image %d cannot be downloaded because of %s!...' % (counter, failure))
        counter += 1


def _image_extension(img_link):
    """Return the extension (with its dot) of the URL's path, or '' if none."""
    # Drop any query string / fragment so "pic.jpg?w=200" still yields ".jpg".
    path = img_link.split('?', 1)[0].split('#', 1)[0]
    return os.path.splitext(path)[1]

def download_photo(img_url, filename):
    """Fetch ``img_url`` and write it to ``filename`` if it is an image.

    The response body is saved only when the main type of the response
    headers is ``image``. This is a best-effort helper: failures are
    reported through the return value instead of being raised.

    :param img_url: URL to fetch.
    :param filename: path of the file to write (opened in binary mode).
    :returns: ``True`` when the image was fetched and written; ``False`` when
        the response is not an image or any error occurred.
    """
    try:
        image_on_web = urllib.urlopen(img_url)
        try:
            if image_on_web.headers.maintype != 'image':
                return False
            buf = image_on_web.read()
        finally:
            # Release the connection on every path, including early return.
            image_on_web.close()
        with open(filename, "wb") as downloaded_image:
            downloaded_image.write(buf)
    except Exception:
        # Deliberately broad (best effort), but unlike a bare ``except`` it
        # lets KeyboardInterrupt / SystemExit stop a long scraping run.
        return False
    return True

if __name__ == '__main__':
    # Every command-line argument is treated as one search query; its images
    # go into a directory named after the query, created here when missing.
    for search_term in sys.argv[1:]:
        directory_missing = not os.path.exists(search_term)
        if directory_missing:
            os.makedirs(search_term)
        installQueriedGoogleImages(search_term)
	#!/usr/bin/env python


	'''
	Query Bing Image Search and download the resulting images by scraping.

	To use this script, install the mechanize and BeautifulSoup packages with
	easy_install mechanize
	easy_install BeautifulSoup

	Example Run:
	installQueriedGoogleImages('spotty')

	Eren Golge erengolge@gmail.com - www.erengolge.com - 17 April 2013
	'''

	import json
	import pdb
	import urllib
	import mechanize
	import cookielib
	import re
	import sys
	import os
	from BeautifulSoup import BeautifulSoup

	def installQueriedGoogleImages(query):
		"""Scrape Bing image search for ``query`` and download the results.

		Despite its name, this function queries Bing
		(``http://www.bing.com/images/search``). Each result tile
		(``<div class="dg_u">``) of the fetched results page is inspected, the
		original image URL is taken from the ``imgurl:"..."`` field of the ``m``
		attribute of the tile's first anchor, and the image is saved as
		``<query>/image<N><ext>`` via ``download_photo``.

		The directory named ``query`` is expected to exist already; this
		function does not create it.

		:param query: search phrase; also used as the output directory name.
		"""
		br = mechanize.Browser()
		br.set_cookiejar(cookielib.LWPCookieJar())

		# Browser options
		br.set_handle_equiv(True)
		br.set_handle_gzip(True)
		br.set_handle_redirect(True)
		br.set_handle_referer(True)
		br.set_handle_robots(False)

		# Follow "refresh 0" redirects but do not hang on refresh > 0.
		br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

		# Present a regular desktop browser User-Agent to the server.
		br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

		# Escape the query so spaces and reserved characters ('&', '#', ...)
		# cannot corrupt the URL.
		main_url = 'http://www.bing.com/images/search?q=' + urllib.quote_plus(query)

		# Only this single results page is fetched; pagination is not implemented.
		html = br.open(main_url).read()
		soup = BeautifulSoup(html)

		counter = 1
		for div in soup.findAll('div', {'class': "dg_u"}):
			links = div.findAll('a')
			if not links:
				# Tile without an anchor: nothing to download.
				continue
			# The anchor's "m" attribute carries the tile metadata, including
			# the original image URL.
			match = re.search('imgurl:"([^"]+)', links[0].get('m') or '')
			if match is None:
				continue
			img_link = match.group(1)
			print('Downloading image %d-%s -...\n' % (counter, img_link))

			failure = None
			try:
				target = query + '/image' + str(counter) + _image_extension(img_link)
				# download_photo reports failure through its return value.
				if not download_photo(img_link, target):
					failure = 'server error'
			except IOError:
				failure = 'server error'
			except UnicodeError:
				# Mixing byte and unicode strings in the target path can fail.
				failure = 'naming of website'
			if failure is not None:
				print('image %d cannot be downloaded because of %s!...' % (counter, failure))
			counter += 1


	def _image_extension(img_link):
		"""Return the extension (with its dot) of the URL's path, or '' if none."""
		# Drop any query string / fragment so "pic.jpg?w=200" still yields ".jpg".
		path = img_link.split('?', 1)[0].split('#', 1)[0]
		return os.path.splitext(path)[1]

	def download_photo(img_url, filename):
		"""Fetch ``img_url`` and write it to ``filename`` if it is an image.

		The response body is saved only when the main type of the response
		headers is ``image``. This is a best-effort helper: failures are
		reported through the return value instead of being raised.

		:param img_url: URL to fetch.
		:param filename: path of the file to write (opened in binary mode).
		:returns: ``True`` when the image was fetched and written; ``False`` when
			the response is not an image or any error occurred.
		"""
		try:
			image_on_web = urllib.urlopen(img_url)
			try:
				if image_on_web.headers.maintype != 'image':
					return False
				buf = image_on_web.read()
			finally:
				# Release the connection on every path, including early return.
				image_on_web.close()
			with open(filename, "wb") as downloaded_image:
				downloaded_image.write(buf)
		except Exception:
			# Deliberately broad (best effort), but unlike a bare ``except`` it
			# lets KeyboardInterrupt / SystemExit stop a long scraping run.
			return False
		return True

	if __name__ == '__main__':
		# Every command-line argument is treated as one search query; its images
		# go into a directory named after the query, created here when missing.
		for search_term in sys.argv[1:]:
			if not os.path.exists(search_term):
				os.makedirs(search_term)
			installQueriedGoogleImages(search_term)