@erogol
Created May 29, 2014 10:07
Bing image scraper; appends extra parameters to the query address.
#!/usr/bin/env python
from bs4 import BeautifulSoup
import requests
import urllib2
import os
import re, urlparse
import time
import pdb
from interruptingcow import timeout
import cPickle
from multiprocessing import Pool
from subprocess import Popen, PIPE
from collections import Counter
'''
TODO:
-> Some of the downloaded images are malformed files. Fix this.
-> Some download links are internal Bing links, so Bing pages also get downloaded. Fix this.
-> get_link() defines root_url inside itself. Pass it as an argument instead.
-> If the link list has already been written, do not scrape again.
'''
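# Overview (summary of the script below): gather_img_files() and full_run() walk the
# mobile Bing image results for a query, pull the first external image link per result
# index via get_link(), write the links to <query>_links.txt under output_img_paths,
# and full_run() then downloads each image with wget into a per-query folder under
# output_path.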
def chunk(l, n):
    if n < 1:
        n = 1
    return [l[i:i + n] for i in range(0, len(l), n)]
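# Illustrative example: chunk() splits a list into consecutive pieces of size n,
# e.g. chunk([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]]. It is only used by the
# commented-out name_chunks code in the __main__ block below.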
def run(cmd):
    print '-' * 40
    print 'running:', cmd
    p = Popen(cmd, stderr=PIPE, stdout=PIPE, shell=True)
    output, errors = p.communicate()
    print [p.returncode, errors, output]
    if p.returncode or errors:
        print 'something went wrong...'
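# Illustrative example (hypothetical path and URL): run() shells out and reports the
# result, e.g.
#   run('wget -O /tmp/0.jpg -t 1 -o download.log --timeout=600 "http://example.com/img.jpg"')
# prints the command, then [returncode, stderr, stdout]; a non-zero return code or any
# stderr output triggers the 'something went wrong...' message.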
def get_soup(url):
    # sometimes requests.get raises a socket error
    try:
        r = requests.get(url)
    except:
        return None
    if r.status_code == 200:
        return BeautifulSoup(r.text)
    else:
        return None
# handle non-ASCII characters
def urlEncodeNonAscii(b):
    return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)

def iriToUri(iri):
    parts = urlparse.urlparse(iri)
    return urlparse.urlunparse(
        part.encode('idna') if parti == 1 else urlEncodeNonAscii(part.encode('utf-8'))
        for parti, part in enumerate(parts)
    )
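# Illustrative example (assuming a unicode IRI is passed in): non-ASCII path characters
# are percent-encoded and the host is IDNA-encoded, e.g.
#   iriToUri(u'http://m.bing.com/images/a\xf1o.jpg')
#   -> 'http://m.bing.com/images/a%c3%b1o.jpg'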
def full_run(query, output_path, output_img_paths, num_imgs=800):
    name = query
    root_url = 'http://m.bing.com'
    raw_query = query.replace(' ', '_')
    # query = query.replace(' ','+') + '+face'
    query = query.replace(' ', '+')
    print query
    img_link_file_path = output_img_paths + query + '_links.txt'
    # if the link file already exists, do not crawl again
    # if os.path.exists(query+'.txt'):
    # Collect real img links from the image show-up pages of Bing
    #p = Pool(100)
    #query_list = zip([query]*num_imgs, range(num_imgs))
    #img_links = p.map(get_link, query_list)
    if os.path.exists(img_link_file_path):
        f = open(img_link_file_path, 'r')
        img_links = f.readlines()
        f.close()
    else:
        img_links = []
        bad_img_links = []  # keep malfunctioning links
        img_counter = 0
        raw_counter = -1
        old_links = Counter()
        while img_counter < num_imgs:
            raw_counter += 1
            print img_counter, ' of query ', query
            with timeout(20, exception=RuntimeError):
                try:
                    link = get_link([query, raw_counter])
                except RuntimeError:
                    print 'Timeout!!!!'
                    img_counter += 1
                    continue
            # wait!! maybe Bing banned you
            if link is None:
                time.sleep(5)
                img_counter += 1
                continue
            try:
                con = urllib2.urlopen(link)
            except:
                bad_img_links.append(link)
                img_counter += 1
                continue
            # if the link does not work, do not count it
            if con.getcode() != 200:
                bad_img_links.append(link)
                img_counter += 1
                continue
            print "link ", link
            # time.sleep(0.5)
            if old_links[link] == 0:
                img_links.append(link)
                img_counter += 1
                old_links[link] += 1
            else:
                print "Duplicate Link!!"
                img_counter += 1
                continue
        # Save img_links to a file
        f = open(img_link_file_path, 'w')
        for img_link in img_links:
            if img_link is not None:
                f.write("%s\n" % iriToUri(img_link))
        f.close()
        # Save bad links
        f = open(output_img_paths + query + '_bad_links.txt', 'w')
        for img_link in bad_img_links:
            if img_link is not None:
                f.write("%s\n" % iriToUri(img_link))
        f.close()
    # Create the root class folder if it does not exist
    try:
        fold_path = output_path + raw_query
        if not os.path.exists(fold_path):
            os.makedirs(fold_path)
    except:
        pass
    # pdb.set_trace()
    for count, img in enumerate(img_links):
        if img is not None:
            #print img
            img = img.strip()
            out_path = fold_path + "/" + str(count) + ".jpg"
            # command = 'wget -O '+out_path+' -o download.log -A.jpeg,.jpg -b ' + iriToUri(img)
            img = iriToUri(img)
            print img, ' to be downloaded'
            command = 'wget -O ' + out_path + ' -t 1 -o download.log --timeout=600 "' + img + '"'
            # os.system(command)
            run(command)
            time.sleep(2)
        else:
            #print img
            print 'IS NONE!!!'
        #raw_img = urllib2.urlopen(img).read()
        # cntr = len([i for i in os.listdir("images") if image_type in i]) + 1
        # f = open("images/" + image_type + "_"+ str(count), 'wb')
        # f.write(raw_img)
        # f.close()
    return True
def gather_img_files(query, output_path, output_img_paths, num_imgs=800):
    name = query
    root_url = 'http://m.bing.com'
    raw_query = query.replace(' ', '_')
    # query = query.replace(' ','+') + '+face'
    query = query.replace(' ', '+')
    print query
    img_link_file_path = output_img_paths + query + '_links.txt'
    img_links = []
    bad_img_links = []  # keep malfunctioning links
    img_counter = 0
    raw_counter = -1
    old_links = Counter()
    while img_counter < num_imgs:
        raw_counter += 1
        try:
            with timeout(5, exception=RuntimeError):
                print img_counter, ' of query ', query
                link = get_link([query, raw_counter])
                # wait!! maybe Bing banned you
                if link is None:
                    time.sleep(5)
                    img_counter += 1
                    continue
                # try:
                #     print 'Checking Link!!!'
                #     con = urllib2.urlopen(link)
                # except:
                #     bad_img_links.append(link)
                #     img_counter += 1
                #     continue
                # # if the link does not work, do not count it
                # if con.getcode() != 200:
                #     bad_img_links.append(link)
                #     img_counter += 1
                #     continue
                print "link ", link
                # time.sleep(0.5)
                if old_links[link] == 0:
                    img_links.append(link)
                    img_counter += 1
                    old_links[link] += 1
                else:
                    print "Duplicate Link!!"
                    img_counter += 1
                    continue
        except RuntimeError:
            print 'Timeout!!!!'
            img_counter += 1
            continue
    # Save img_links to a file
    f = open(img_link_file_path, 'w')
    for img_link in img_links:
        if img_link is not None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
    # Save bad links
    f = open(output_img_paths + query + '_bad_links.txt', 'w')
    for img_link in bad_img_links:
        if img_link is not None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
# worker code for multiprocessing
def get_link(args):
    # url = "http://m.bing.com/images/more?q="+args[0]+"&ii="+str(args[1])+"&dv=True&form=IGSIGS&IIG=c2a0b6a0c2ab4b179a7c565fa914d169&kval=3.1&AppNs=mSERP"
    url = "http://m.bing.com/images/?q=" + args[0] + "&ii=" + str(args[1]) + "&dv=True"
    print url
    soup = get_soup(url)
    if soup is not None:
        # if no link was retrieved, return None
        matches = [a['href'] for a in soup.find_all("a", {"href": re.compile("http:")})]
        if len(matches) > 0:
            img_link = matches[0]
            img_link = img_link.split('?')[0]
            return img_link
        else:
            return None  # link has some error
    else:
        return None  # page gives some error
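# Illustrative example (hypothetical result URL): get_link(['barack+obama', 0]) fetches
# the mobile Bing image page for that query/index, takes the href of the first absolute
# "http:" anchor on the page, and strips any query string, e.g.
#   'http://somehost.com/photo.jpg?w=230&h=170' -> 'http://somehost.com/photo.jpg'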
if __name__ == "__main__":
    chunk_no = 1
    output_path = "/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/bing_images/"
    output_img_paths = '/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/bing_image_urls/'
    # folders = os.listdir('/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/pubfig83/images')
    # names = [folder.replace('_', ' ') for folder in folders]
    f = open('/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/all_names.txt', 'r')
    names = f.readlines()
    f.close()
    # name_chunks = chunk(names, 5)
    # names = name_chunks[chunk_no-1]
    # f = open('/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/Dataset_matlab/all_names.txt','r')
    # names = f.readlines()
    # First pass: resume link gathering starting from the name after LAST_NAME
    LAST_NAME = 'nikolay davydenko'
    cont_flag = False
    for name in names:
        name = name.strip()
        # folder_name = name.replace(" ","_")
        if cont_flag:
            gather_img_files(name, output_path, output_img_paths, 800)
        if name == LAST_NAME:
            cont_flag = True
    # Second pass: link gathering plus download for every name
    for name in names:
        name = name.strip()
        full_run(name, output_path, output_img_paths, 800)
    # full_run('tiger')