@InputBlackBoxOutput
Created February 26, 2023 19:49
Scrape images from clipart-library.com search results using requests-html
import os
import random
import shutil
import time

import requests

# pip install requests-html
from requests_html import HTMLSession

session = HTMLSession()


def scrape_images(keyword, n_pages=3):
    # Create the output directory for this keyword (output/<keyword>)
    os.makedirs(f"output/{keyword}", exist_ok=True)

    # Fetch each results page and render its JavaScript content
    for page in range(n_pages):
        count = 0
        url = f"http://clipart-library.com/search1/?q={keyword}#gsc.tab=1&gsc.q={keyword}&gsc.page={page}"
        print(url)

        r = session.get(url)
        r.html.render()
        time.sleep(random.randint(2, 7))

        # Extract the src attribute from every <img> tag on the page
        src_list = []
        for img in r.html.find("img"):
            src = img.attrs.get("src")
            if src:
                src_list.append(src)

        # Download each unique image and write it to a file
        for src in set(src_list):
            # Skip relative paths such as '../...'
            if ".." not in src:
                print(src)
                with requests.get(src, stream=True) as response:
                    with open(f"output/{keyword}/{page}-{count + 1}.png", "wb") as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                count += 1


if __name__ == "__main__":
    scrape_images(keyword="apple")

    # Batch mode: scrape every keyword listed in keywords.lst (one per line)
    # with open("keywords.lst") as keyword_file:
    #     keywords = keyword_file.read().splitlines()
    #     print(len(keywords))
    #     for keyword in keywords:
    #         print(keyword)
    #         scrape_images(keyword)
    #         time.sleep(random.randint(2, 7))
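
A minimal runnable sketch of the batch mode from the commented block above, assuming the script is saved as scrape_images.py and a keywords.lst file (one search term per line) exists in the working directory; both file names are assumptions taken from the comments, not verified:

import random
import time

# Assumes the script above is saved as scrape_images.py in the same directory
from scrape_images import scrape_images

if __name__ == "__main__":
    with open("keywords.lst") as keyword_file:
        keywords = keyword_file.read().splitlines()
    print(f"Scraping {len(keywords)} keywords")

    for keyword in keywords:
        print(keyword)
        scrape_images(keyword)
        # Random pause between keywords, mirroring the per-page delay above
        time.sleep(random.randint(2, 7))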