Last active
June 29, 2021 08:36
-
-
Save Ed-Optalysys/232d0fa6312e2ac8e9ef031ed7b848ad to your computer and use it in GitHub Desktop.
Python & selenium image web scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from PIL import Image | |
import os | |
import time | |
import io | |
import requests | |
import hashlib | |
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver,
                     sleep_between_interactions: float = 1):
    """Collect up to *max_links_to_fetch* full-size image URLs from a
    Google Images search for *query*.

    Args:
        query: search term typed into Google Images.
        max_links_to_fetch: stop once this many distinct URLs are collected.
        wd: an already-created Selenium WebDriver (a Chrome driver in this
            script's usage).
        sleep_between_interactions: seconds to pause after each scroll and
            thumbnail click so results can load. Callers pass floats
            (e.g. 0.1), so the annotation is float, not int.

    Returns:
        A set of image URL strings. May be smaller than requested if
        loading stalls for more than 15 wait iterations.
    """
    def scroll_to_end(wd):
        # Scroll to the page bottom so Google lazily loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0          # index of the first thumbnail not yet processed
    load_more_iterations = 0   # renamed from load_mode_iterations (typo)
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)
        # NOTE(review): find_elements_by_css_selector / find_element_by_css_selector
        # were removed in Selenium 4 — migrate to wd.find_elements(By.CSS_SELECTOR, ...)
        # when the pinned selenium version is upgraded.
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)
        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
        for img in thumbnail_results[results_start:number_results]:
            # Click the thumbnail so the full-resolution <img> element appears.
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))
            image_count = len(image_urls)
            if (len(image_urls) >= max_links_to_fetch):
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # for/else: no break fired, i.e. the target count was not reached
            # with the thumbnails seen so far — wait and try to load more.
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            time.sleep(1)
            load_more_iterations += 1
            if load_more_iterations > 15:
                print(
                    f"Giving up. Found: {len(image_urls)} image links, done!")
                return image_urls
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")
        # Skip already-processed thumbnails on the next pass.
        results_start = len(thumbnail_results)
    return image_urls
def persist_image(folder_path: str, url: str):
    """Download *url*, centre-crop it to a square, resize it to
    IMAGE_SIZE x IMAGE_SIZE and save it as a JPEG named by a prefix of
    the SHA-1 of the raw download (stable name, de-duplicates content).

    Images whose smaller dimension is below IMAGE_SIZE are rejected.
    All errors are reported on stdout; nothing is raised to the caller.
    """
    try:
        image_content = requests.get(url, timeout=10).content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # BUG FIX: the original fell through here, so the save step then
        # raised NameError on the unbound image_content and printed a
        # misleading "Could not save" error. Bail out instead.
        return
    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        _w, _h = image.size
        min_dim = min(_w, _h)
        if min_dim < IMAGE_SIZE:
            raise RuntimeError('Image too small: ' + str(_w) + ' ' + str(_h))
        # Centre square crop, then scale down to the target size.
        half_min_dim = int(min_dim / 2)
        centre_h = int(_h / 2)
        centre_w = int(_w / 2)
        start_h = centre_h - half_min_dim
        start_w = centre_w - half_min_dim
        end_h = start_h + min_dim
        end_w = start_w + min_dim
        image = image.crop((start_w, start_h, end_w, end_h))
        image = image.resize((IMAGE_SIZE, IMAGE_SIZE))
        file_path = os.path.join(folder_path, hashlib.sha1(
            image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG")
        print(f"SUCCESS - Saved image from {url}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")
def search_and_download(search_term: str, driver_path: str, target_path='images', number_images=5):
    """Scrape Google Images for *search_term* and save the results locally.

    Images are written to ``<target_path>/<search_term with underscores>/``.

    Args:
        search_term: query passed to Google Images.
        driver_path: path to the chromedriver executable.
        target_path: root folder for all downloads (default ``'images'``).
        number_images: how many image links to collect (default 5).
    """
    target_folder = os.path.join(target_path, "_".join(search_term.split()))
    # exist_ok=True avoids the check-then-create race of the original
    # os.path.exists() test and is a no-op when the folder already exists.
    os.makedirs(target_folder, exist_ok=True)
    # NOTE(review): executable_path= is deprecated in Selenium 4 (use
    # Service(driver_path)) — confirm the pinned selenium version.
    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images,
                               wd=wd, sleep_between_interactions=0.1)
    # The browser is closed before downloading: persist_image uses requests,
    # not the driver.
    for elem in res:
        persist_image(target_folder, elem)
# Path to the chromedriver binary (Windows-style filename; adjust per platform).
DRIVER_PATH = 'scraping/chromedriver.exe'
# Saved images are centre-cropped squares with this edge length in pixels;
# read by persist_image().
IMAGE_SIZE = 256

if __name__ == '__main__':
    # One scrape per category, 1000 links each; every call opens and closes
    # its own browser session.
    for term in ('citrus fruit', 'leaves', 'rainforest', 'cuttlefish', 'lion'):
        search_and_download(term, DRIVER_PATH, number_images=1000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment