Skip to content

Instantly share code, notes, and snippets.

@erogol
Last active December 23, 2015 15:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erogol/6659566 to your computer and use it in GitHub Desktop.
Save erogol/6659566 to your computer and use it in GitHub Desktop.
Crawl and scrape Bing Image Search images
#!/usr/bin/env python
'''
Query Bing Image Search and download the resulting images by scraping.
To use this script, install the mechanize and BeautifulSoup packages:
easy_install mechanize
easy_install BeautifulSoup
Example run:
installQueriedGoogleImages('spotty')
Eren Golge erengolge@gmail.com - www.erengolge.com - 17 April 2013
'''
import json
import pdb
import urllib
import mechanize
import cookielib
import re
import sys
import os
from BeautifulSoup import BeautifulSoup
def _image_extension(img_url):
    """Return the file extension (with leading dot) of an image URL.

    The query string and fragment are ignored. An empty string is returned
    when the URL does not end in a short alphanumeric suffix, so callers
    never get an arbitrary slice of the URL as an "extension".
    """
    path = img_url.split('?', 1)[0].split('#', 1)[0]
    ext = os.path.splitext(path)[1]
    if 1 < len(ext) <= 5 and ext[1:].isalnum():
        return ext
    return ''


def installQueriedGoogleImages(query):
    """Search Bing Images for *query* and download the images it finds.

    Despite the historical name, the search is sent to Bing. The first
    result page is fetched with a mechanize browser, every ``div`` of class
    ``dg_u`` is inspected, and the image URL embedded in the ``m`` attribute
    of its first anchor is handed to ``download_photo``. Files are written
    as ``<query>/image<N><ext>``; the ``<query>`` directory is created if it
    does not exist yet.

    Parameters:
        query: search term; also used as the output directory name.

    Returns:
        None. Progress and failures are reported on standard output.
    """
    br = mechanize.Browser()
    br.set_cookiejar(cookielib.LWPCookieJar())
    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but does not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Present a desktop browser User-Agent to the server.
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    # The images are saved below a directory named after the query; make
    # sure it exists so a direct call works without extra setup.
    if not os.path.isdir(query):
        os.makedirs(query)

    # Encode the query so spaces and reserved characters survive in the URL.
    main_url = 'http://www.bing.com/images/search?q=' + urllib.quote_plus(query)

    # The response body can only be consumed once, so the page is read a
    # single time (result pagination is not implemented).
    html = br.open(main_url).read()
    soup = BeautifulSoup(html)

    img_url_pattern = re.compile(r'imgurl:"([^"]+)')
    counter = 1
    for div in soup.findAll('div', {'class': "dg_u"}):
        # Skip tiles that lack an anchor, the 'm' metadata attribute, or an
        # embedded image URL instead of aborting the whole crawl.
        link = div.find('a')
        meta = link.get('m') if link is not None else None
        match = img_url_pattern.search(meta) if meta else None
        if match is None:
            continue
        img_link = match.group(1)

        print('Downloading image %d-%s -...\n' % (counter, img_link))
        filename = os.path.join(
            query, 'image' + str(counter) + _image_extension(img_link))
        try:
            succeeded = download_photo(img_link, filename)
        except IOError:
            print('image %d cannot be downloaded because of server error!...' % counter)
        except UnicodeError:
            print('image %d cannot be downloaded because of naming of website!...' % counter)
        else:
            # download_photo signals failure through its return value.
            if not succeeded:
                print('image %d cannot be downloaded because of server error!...' % counter)
        counter += 1
def download_photo(img_url, filename):
    """Download *img_url* and store it in *filename* if it is an image.

    The resource is saved only when the major part of the response's
    content type is ``image``. This is a best-effort helper: any ordinary
    error while connecting, reading or writing is reported through the
    return value rather than raised.

    Parameters:
        img_url: address of the resource to fetch.
        filename: path of the file to create (opened in binary mode).

    Returns:
        True if the image was fetched and written, False otherwise.
    """
    # `except Exception` (not a bare `except`) keeps the best-effort
    # contract while letting KeyboardInterrupt/SystemExit stop the crawl.
    try:
        image_on_web = urllib.urlopen(img_url)
    except Exception:
        return False

    try:
        if image_on_web.headers.maintype != 'image':
            return False
        buf = image_on_web.read()
        with open(filename, "wb") as downloaded_image:
            downloaded_image.write(buf)
    except Exception:
        return False
    finally:
        # Always release the connection, including on the early returns.
        image_on_web.close()
    return True
if __name__ == '__main__':
    # Every command-line argument is treated as a separate search query;
    # its images go into a directory of the same name.
    for color in sys.argv[1:]:
        if not os.path.exists(color):
            os.makedirs(color)
        installQueriedGoogleImages(color)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment