@erogol
Created May 29, 2014 10:07
Bing image scraper; appends extra parameters to the query address.
#!/usr/bin/env python
from bs4 import BeautifulSoup
import requests
import urllib2
import os
import re, urlparse
import time
import pdb
from interruptingcow import timeout
import cPickle
from multiprocessing import Pool
from subprocess import Popen, PIPE
from collections import Counter
'''
TODO:
-> Some of the downloaded images are malformed files. Fix this.
-> Some download links are internal Bing links, so Bing pages also get downloaded. Fix this.
-> get_link() defines root_url inside itself. Pass it as an argument instead.
-> If the link list has already been written, do not scrape again.
'''
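# Overview (summary of the script below): gather_img_files() and full_run() walk the
# mobile Bing image results for a query, pull the first external image link per result
# index via get_link(), write the links to <query>_links.txt under output_img_paths,
# and full_run() then downloads each image with wget into a per-query folder under
# output_path.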
def chunk(l, n):
    if n < 1:
        n = 1
    return [l[i:i + n] for i in range(0, len(l), n)]
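# Illustrative example: chunk() splits a list into consecutive pieces of size n,
# e.g. chunk([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]]. It is only used by the
# commented-out name_chunks code in the __main__ block below.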
def run(cmd):
    print '-' * 40
    print 'running:', cmd
    p = Popen(cmd, stderr=PIPE, stdout=PIPE, shell=True)
    output, errors = p.communicate()
    print [p.returncode, errors, output]
    if p.returncode or errors:
        print 'something went wrong...'
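# Illustrative example (hypothetical path and URL): run() shells out and reports the
# result, e.g.
#   run('wget -O /tmp/0.jpg -t 1 -o download.log --timeout=600 "http://example.com/img.jpg"')
# prints the command, then [returncode, stderr, stdout]; a non-zero return code or any
# stderr output triggers the 'something went wrong...' message.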
def get_soup(url):
    # sometimes requests.get raises a socket error
    try:
        r = requests.get(url)
    except:
        return None
    if r.status_code == 200:
        return BeautifulSoup(r.text)
    else:
        return None
# handle non-ASCII characters
def urlEncodeNonAscii(b):
    return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)

def iriToUri(iri):
    parts = urlparse.urlparse(iri)
    return urlparse.urlunparse(
        part.encode('idna') if parti == 1 else urlEncodeNonAscii(part.encode('utf-8'))
        for parti, part in enumerate(parts)
    )
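# Illustrative example (assuming a unicode IRI is passed in): non-ASCII path characters
# are percent-encoded and the host is IDNA-encoded, e.g.
#   iriToUri(u'http://m.bing.com/images/a\xf1o.jpg')
#   -> 'http://m.bing.com/images/a%c3%b1o.jpg'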
def full_run(query, output_path, output_img_paths, num_imgs=800):
    name = query
    root_url = 'http://m.bing.com'
    raw_query = query.replace(' ', '_')
    # query = query.replace(' ','+') + '+face'
    query = query.replace(' ', '+')
    print query
    img_link_file_path = output_img_paths + query + '_links.txt'
    # if the link file already exists, do not crawl again
    # if os.path.exists(query+'.txt'):
    # Collect real img links from the image show-up pages of Bing
    #p = Pool(100)
    #query_list = zip([query]*num_imgs, range(num_imgs))
    #img_links = p.map(get_link, query_list)
    if os.path.exists(img_link_file_path):
        f = open(img_link_file_path, 'r')
        img_links = f.readlines()
        f.close()
    else:
        img_links = []
        bad_img_links = []  # keep malfunctioning links
        img_counter = 0
        raw_counter = -1
        old_links = Counter()
        while img_counter < num_imgs:
            raw_counter += 1
            print img_counter, ' of query ', query
            with timeout(20, exception=RuntimeError):
                try:
                    link = get_link([query, raw_counter])
                except RuntimeError:
                    print 'Timeout!!!!'
                    img_counter += 1
                    continue
            # wait!! maybe Bing banned you
            if link is None:
                time.sleep(5)
                img_counter += 1
                continue
            try:
                con = urllib2.urlopen(link)
            except:
                bad_img_links.append(link)
                img_counter += 1
                continue
            # if the link does not work, do not count it
            if con.getcode() != 200:
                bad_img_links.append(link)
                img_counter += 1
                continue
            print "link ", link
            # time.sleep(0.5)
            if old_links[link] == 0:
                img_links.append(link)
                img_counter += 1
                old_links[link] += 1
            else:
                print "Duplicate Link!!"
                img_counter += 1
                continue
        # Save img_links to a file
        f = open(img_link_file_path, 'w')
        for img_link in img_links:
            if img_link is not None:
                f.write("%s\n" % iriToUri(img_link))
        f.close()
        # Save bad links
        f = open(output_img_paths + query + '_bad_links.txt', 'w')
        for img_link in bad_img_links:
            if img_link is not None:
                f.write("%s\n" % iriToUri(img_link))
        f.close()
    # Create the root class folder if it does not exist
    try:
        fold_path = output_path + raw_query
        if not os.path.exists(fold_path):
            os.makedirs(fold_path)
    except:
        pass
    # pdb.set_trace()
    for count, img in enumerate(img_links):
        if img is not None:
            #print img
            img = img.strip()
            out_path = fold_path + "/" + str(count) + ".jpg"
            # command = 'wget -O '+out_path+' -o download.log -A.jpeg,.jpg -b ' + iriToUri(img)
            img = iriToUri(img)
            print img, ' to be downloaded'
            command = 'wget -O ' + out_path + ' -t 1 -o download.log --timeout=600 "' + img + '"'
            # os.system(command)
            run(command)
            time.sleep(2)
        else:
            #print img
            print 'IS NONE!!!'
        #raw_img = urllib2.urlopen(img).read()
        # cntr = len([i for i in os.listdir("images") if image_type in i]) + 1
        # f = open("images/" + image_type + "_"+ str(count), 'wb')
        # f.write(raw_img)
        # f.close()
    return True
def gather_img_files(query, output_path, output_img_paths, num_imgs=800):
    name = query
    root_url = 'http://m.bing.com'
    raw_query = query.replace(' ', '_')
    # query = query.replace(' ','+') + '+face'
    query = query.replace(' ', '+')
    print query
    img_link_file_path = output_img_paths + query + '_links.txt'
    img_links = []
    bad_img_links = []  # keep malfunctioning links
    img_counter = 0
    raw_counter = -1
    old_links = Counter()
    while img_counter < num_imgs:
        raw_counter += 1
        try:
            with timeout(5, exception=RuntimeError):
                print img_counter, ' of query ', query
                link = get_link([query, raw_counter])
                # wait!! maybe Bing banned you
                if link is None:
                    time.sleep(5)
                    img_counter += 1
                    continue
                # try:
                #     print 'Checking Link!!!'
                #     con = urllib2.urlopen(link)
                # except:
                #     bad_img_links.append(link)
                #     img_counter += 1
                #     continue
                # # if the link does not work, do not count it
                # if con.getcode() != 200:
                #     bad_img_links.append(link)
                #     img_counter += 1
                #     continue
                print "link ", link
                # time.sleep(0.5)
                if old_links[link] == 0:
                    img_links.append(link)
                    img_counter += 1
                    old_links[link] += 1
                else:
                    print "Duplicate Link!!"
                    img_counter += 1
                    continue
        except RuntimeError:
            print 'Timeout!!!!'
            img_counter += 1
            continue
    # Save img_links to a file
    f = open(img_link_file_path, 'w')
    for img_link in img_links:
        if img_link is not None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
    # Save bad links
    f = open(output_img_paths + query + '_bad_links.txt', 'w')
    for img_link in bad_img_links:
        if img_link is not None:
            f.write("%s\n" % iriToUri(img_link))
    f.close()
# worker code for multiprocessing
def get_link(args):
    # url = "http://m.bing.com/images/more?q="+args[0]+"&ii="+str(args[1])+"&dv=True&form=IGSIGS&IIG=c2a0b6a0c2ab4b179a7c565fa914d169&kval=3.1&AppNs=mSERP"
    url = "http://m.bing.com/images/?q=" + args[0] + "&ii=" + str(args[1]) + "&dv=True"
    print url
    soup = get_soup(url)
    if soup is not None:
        # if no link was retrieved, return None
        matches = [a['href'] for a in soup.find_all("a", {"href": re.compile("http:")})]
        if len(matches) > 0:
            img_link = matches[0]
            img_link = img_link.split('?')[0]
            return img_link
        else:
            return None  # link has some error
    else:
        return None  # page gives some error
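# Illustrative example (hypothetical result URL): get_link(['barack+obama', 0]) fetches
# the mobile Bing image page for that query/index, takes the href of the first absolute
# "http:" anchor on the page, and strips any query string, e.g.
#   'http://somehost.com/photo.jpg?w=230&h=170' -> 'http://somehost.com/photo.jpg'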
if __name__ == "__main__":
    chunk_no = 1
    output_path = "/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/bing_images/"
    output_img_paths = '/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/bing_image_urls/'
    # folders = os.listdir('/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/pubfig83/images')
    # names = [folder.replace('_', ' ') for folder in folders]
    f = open('/home/retina19/mnt/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/FANLARGE/all_names.txt', 'r')
    names = f.readlines()
    f.close()
    # name_chunks = chunk(names, 5)
    # names = name_chunks[chunk_no-1]
    # f = open('/media/retina18/66f4f5c6-ed98-470a-978f-f667aed46a88/FACE_PROJECT_DATA/Dataset_matlab/all_names.txt','r')
    # names = f.readlines()
    # First pass: resume link gathering starting from the name after LAST_NAME
    LAST_NAME = 'nikolay davydenko'
    cont_flag = False
    for name in names:
        name = name.strip()
        # folder_name = name.replace(" ","_")
        if cont_flag:
            gather_img_files(name, output_path, output_img_paths, 800)
        if name == LAST_NAME:
            cont_flag = True
    # Second pass: link gathering plus download for every name
    for name in names:
        name = name.strip()
        full_run(name, output_path, output_img_paths, 800)
    # full_run('tiger')