@InputBlackBoxOutput
Created February 26, 2023 19:49
Scrape images from clipart-library.com search results using requests-html
import os
import random
import shutil
import time

import requests

# pip install requests-html
from requests_html import HTMLSession

session = HTMLSession()


def scrape_images(keyword, n_pages=3):
    # Create the output directory for this keyword (output/<keyword>)
    os.makedirs(f"output/{keyword}", exist_ok=True)

    # Fetch each results page and render its JavaScript content
    for page in range(n_pages):
        count = 0
        url = f"http://clipart-library.com/search1/?q={keyword}#gsc.tab=1&gsc.q={keyword}&gsc.page={page}"
        print(url)

        r = session.get(url)
        r.html.render()
        time.sleep(random.randint(2, 7))

        # Extract the src attribute from every <img> tag on the page
        src_list = []
        for img in r.html.find("img"):
            src = img.attrs.get("src")
            if src:
                src_list.append(src)

        # Download each unique image and write it to a file
        for src in set(src_list):
            # Skip relative paths such as '../...'
            if ".." not in src:
                print(src)
                with requests.get(src, stream=True) as response:
                    with open(f"output/{keyword}/{page}-{count + 1}.png", "wb") as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                count += 1


if __name__ == "__main__":
    scrape_images(keyword="apple")

    # Batch mode: scrape every keyword listed in keywords.lst (one per line)
    # with open("keywords.lst") as keyword_file:
    #     keywords = keyword_file.read().splitlines()
    #     print(len(keywords))
    #     for keyword in keywords:
    #         print(keyword)
    #         scrape_images(keyword)
    #         time.sleep(random.randint(2, 7))
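
A minimal runnable sketch of the batch mode from the commented block above, assuming the script is saved as scrape_images.py and a keywords.lst file (one search term per line) exists in the working directory; both file names are assumptions taken from the comments, not verified:

import random
import time

# Assumes the script above is saved as scrape_images.py in the same directory
from scrape_images import scrape_images

if __name__ == "__main__":
    with open("keywords.lst") as keyword_file:
        keywords = keyword_file.read().splitlines()
    print(f"Scraping {len(keywords)} keywords")

    for keyword in keywords:
        print(keyword)
        scrape_images(keyword)
        # Random pause between keywords, mirroring the per-page delay above
        time.sleep(random.randint(2, 7))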