@RichardNesbitt
Created April 5, 2024 00:25
Get URLs of images on sub-pages using Python
# This script still contains all of the print() outputs that I used while refining and debugging it.
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urljoin
from bs4 import BeautifulSoup


def find_products(url):
    print(f"Fetching content from URL: {url}")

    # Set up the Selenium WebDriver
    options = Options()
    options.add_argument('--headless')
    service = Service('/usr/local/bin/chromedriver')  # Path to the chromedriver executable; this is the path on macOS.
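    # Note (assumption: Selenium 4.6+): Selenium Manager can usually resolve a matching
    # chromedriver on its own, so the explicit Service path above is optional there:
    #   driver = webdriver.Chrome(options=options)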
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Load the URL
        driver.get(url)
        print("Page loaded successfully.")

        # Wait for dynamically loaded content to appear (adjust the timeout as needed)
        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, 'img')))
        print("Dynamically loaded content appeared.")

        # Get the HTML content after all dynamic content is loaded
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all links containing 'products' in the path.
        # The links I needed to target on the given 'url' contained 'products' in their path.
        product_links = [link.get('href') for link in soup.find_all('a', href=lambda href: href and 'products' in href)]
        if product_links:
            print(f"Found {len(product_links)} product links.")
        else:
            print("No product links found.")
        # Iterate over each product link
        for product_link in product_links:
            print(f"Fetching content from product URL: {product_link}")

            # Load the product page
            driver.get(urljoin(url, product_link))
            print("Product page loaded successfully.")

            # Wait for dynamically loaded content to appear (adjust the timeout as needed)
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'figure.wp-block-gallery figure a')))
            print("Dynamically loaded content on product page appeared.")

            # Wait for the first link matching the selector to become clickable.
            # I had to wait here because the image galleries are loaded with JS.
            # Clicking one of the images opens the gallery and puts 'ggbl_slider' in the DOM.
            link_element = WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'figure.wp-block-gallery figure a')))
            link_element.click()
            print("Clicked on the first link matching the selector.")

            # Wait for dynamically loaded content to appear in ggbl_slider (adjust the timeout as needed)
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.ID, 'ggbl_slider')))
            print("Dynamically loaded content in ggbl_slider appeared.")

            # Get the HTML content after all dynamic content is loaded on the product page
            product_html_content = driver.page_source
            product_soup = BeautifulSoup(product_html_content, 'html.parser')

            # Find the <ul> element with ID "ggbl_slider"
            ul_element = product_soup.find('ul', id='ggbl_slider')

            # If the <ul> element with ID "ggbl_slider" exists
            if ul_element:
                print("Found <ul> element with ID 'ggbl_slider'.")

                # Find all <li> elements inside the <ul>
                li_elements = ul_element.find_all('li')

                # Iterate over each <li> element
                for li in li_elements:
                    # Find the <img> tag inside the <li>
                    img_tag = li.find('img')

                    # If the <img> tag exists, print the src attribute
                    if img_tag:
                        print(f"Image src: {img_tag.get('src')}")
                    else:
                        print("No <img> tag found.")
            else:
                print("No <ul> element with ID 'ggbl_slider' found.")
    finally:
        # Close the WebDriver
        driver.quit()


# Example usage:
url = 'https://the-starting-page.com'
find_products(url)
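
If you want the image URLs as data rather than console output, a minimal variant (a sketch only, assuming the same gallery markup as above; the helper name is hypothetical) is to collect the src values into a list instead of printing them:

# Sketch: collect the gallery image URLs from an already-parsed product page.
def collect_gallery_srcs(product_soup):
    srcs = []
    ul_element = product_soup.find('ul', id='ggbl_slider')
    if ul_element:
        for li in ul_element.find_all('li'):
            img_tag = li.find('img')
            if img_tag and img_tag.get('src'):
                srcs.append(img_tag['src'])
    return srcs

# Inside the product-link loop you could then do something like:
#   all_srcs.extend(collect_gallery_srcs(product_soup))
# and afterwards write all_srcs to a file, one URL per line.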