Skip to content

Instantly share code, notes, and snippets.

@cblte
Created June 30, 2024 10:00
Show Gist options
  • Save cblte/a26aefa609095c3bca34a7d0b9ca9b30 to your computer and use it in GitHub Desktop.
Save cblte/a26aefa609095c3bca34a7d0b9ca9b30 to your computer and use it in GitHub Desktop.
Download all product images from the soon-to-close webshop. With the images, you can then redraw the designs you want for plotting.
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
from tqdm import tqdm
# Create the output folder for the downloaded images. exist_ok=True makes this
# a no-op when the folder already exists, so re-running the script does not fail.
os.makedirs('maurelia_images', exist_ok=True)
def download_image(url, folder, timeout=30):
    """
    Download the image at `url` and save it into `folder`.

    The file is named after the last path component of the URL. Network
    errors, non-200 responses and URLs without a file name are reported on
    stdout and skipped instead of raising, so a bulk run keeps going.

    :param url: Absolute URL of the image to download.
    :param folder: Existing directory in which the image is stored.
    :param timeout: Seconds to wait for the server before giving up.
    :return: Path of the saved file, or None when nothing was saved.
    :raises OSError: If the target file cannot be written.
    """
    try:
        # Without a timeout a single stalled connection blocks forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Could not download {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Skipping {url}: HTTP status {response.status_code}")
        return None

    # Derive the file name from the URL path only (query string excluded).
    image_name = os.path.basename(urlparse(url).path)
    if not image_name:
        # A path ending in "/" has no base name; opening `folder` itself
        # for writing would fail, so skip such URLs explicitly.
        print(f"Skipping {url}: no file name in URL")
        return None

    target_path = os.path.join(folder, image_name)
    with open(target_path, 'wb') as f:
        f.write(response.content)
    return target_path
# Shop listing pages to crawl: one URL per category, plus every numbered
# page of the paginated "digistamps" category.
pages = [
    "https://maurelia.de/Shop/lasercut/",
    "https://maurelia.de/Shop/kerzensticker/",
    "https://maurelia.de/Shop/plottandstamp/",
    "https://maurelia.de/Shop/plottdesign-plotterdateien-kinderkleidung/",
    "https://maurelia.de/Shop/wundertuete/",
    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/1",
    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/2",
    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/3",
    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/4",
]
item_links = []
# Crawl every listing page and collect the product-detail URLs it links to.
# A product can appear in more than one category, so links already collected
# are skipped to avoid fetching and downloading the same item twice.
seen_links = set()
print("Parsing pages for URLS...")
for page in tqdm(pages, desc="Pages"):
    try:
        # Without a timeout a single stalled connection blocks forever.
        response = requests.get(page, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"Skipping page {page}: {exc}")
        continue
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all item links on the page
    for anchor in soup.select('a.woocommerce-LoopProduct-link.woocommerce-loop-product__link'):
        href = anchor.get('href')
        if not href:
            continue
        # Resolve relative hrefs against the listing page they came from.
        item_url = urljoin(page, href)
        if item_url not in seen_links:
            seen_links.add(item_url)
            item_links.append(item_url)
# Visit every collected product page, locate its main product image and save
# the full-size version. A failure on one item is reported and skipped so it
# cannot abort the remaining downloads.
print("Downloading images...")
for link in tqdm(item_links, desc="Images"):
    try:
        # Without a timeout a single stalled connection blocks forever.
        item_response = requests.get(link, timeout=30)
        item_response.raise_for_status()
    except requests.RequestException as exc:
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"Skipping item {link}: {exc}")
        continue
    item_soup = BeautifulSoup(item_response.content, 'html.parser')
    # Find the image with the class "wp-post-image" on the item detail page
    image_tag = item_soup.find('img', class_="wp-post-image")
    if image_tag is None:
        continue
    # The 'data-large_image' attribute holds the URL that gets downloaded.
    img_url = image_tag.get('data-large_image')
    if not img_url:
        continue
    # Make the URL absolute: protocol-relative URLs ("//host/...") are forced
    # to https, anything else is resolved against the item page URL.
    if img_url.startswith("//"):
        img_url = "https:" + img_url
    else:
        img_url = urljoin(link, img_url)
    try:
        download_image(img_url, 'maurelia_images')
    except (requests.RequestException, OSError) as exc:
        tqdm.write(f"Could not save {img_url}: {exc}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment