Created
June 30, 2024 10:00
-
-
Save cblte/a26aefa609095c3bca34a7d0b9ca9b30 to your computer and use it in GitHub Desktop.
Download all images from the soon-to-close webshop. With the images, you can then redraw the designs you want for plotting.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import os | |
from urllib.parse import urljoin, urlparse | |
from tqdm import tqdm | |
# Make sure the download folder exists before any image is written to it;
# an already existing folder is not an error.
os.makedirs(name='maurelia_images', exist_ok=True)
def download_image(url, folder, timeout=30):
    """
    Download the image at `url` and save it into `folder`.

    The file name is the last segment of the URL path; an existing file with
    the same name is overwritten.

    :param url: Absolute URL of the image file to download.
    :param folder: Existing directory in which the image is stored.
    :param timeout: Seconds to wait for the server before giving up, so a
        stalled connection cannot hang the whole run.
    :return: Path of the saved file, or None when nothing was saved (the URL
        has no file name, the request failed, or the status was not 200).
    """
    # Derive the file name first: a URL whose path is empty or ends in "/"
    # has an empty basename, and opening `folder` itself for writing fails.
    image_name = os.path.basename(urlparse(url).path)
    if not image_name:
        return None

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # One unreachable image should not abort the remaining downloads.
        tqdm.write(f"Skipping {url}: {error}")
        return None

    if response.status_code != 200:
        return None

    # Save the image
    target_path = os.path.join(folder, image_name)
    with open(target_path, 'wb') as f:
        f.write(response.content)
    return target_path
# List of pages to scrape: one listing URL per shop category, followed by
# every page of the paginated digistamps category.
BASE_URL = "https://maurelia.de/Shop/"
CATEGORY_SLUGS = [
    "lasercut",
    "kerzensticker",
    "plottandstamp",
    "plottdesign-plotterdateien-kinderkleidung",
    "wundertuete",
]
DIGISTAMPS_SLUG = "digistamps-maurelia-illustrationen"
# Pages 1..4 were listed explicitly before; adjust the count if the
# category grows or shrinks.
DIGISTAMPS_PAGE_COUNT = 4

pages = [f"{BASE_URL}{slug}/" for slug in CATEGORY_SLUGS]
pages += [
    f"{BASE_URL}{DIGISTAMPS_SLUG}/page/{number}"
    for number in range(1, DIGISTAMPS_PAGE_COUNT + 1)
]
item_links = []
seen_links = set()

# Scrape each listing page and collect the URLs of the item detail pages.
# A link is queued only once, even when several listing pages contain it,
# so the same item is not fetched repeatedly later on.
print("Parsing pages for URLS...")
for page in tqdm(pages, desc="Pages"):
    try:
        response = requests.get(page, timeout=30)
    except requests.RequestException as error:
        # A single unreachable listing page should not abort the whole run.
        tqdm.write(f"Skipping page {page}: {error}")
        continue
    if response.status_code != 200:
        # Do not parse an error page as if it were a listing.
        tqdm.write(f"Skipping page {page}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all item links on the page
    for anchor in soup.select('a.woocommerce-LoopProduct-link.woocommerce-loop-product__link'):
        href = anchor.get('href')  # .get(): an anchor without href is skipped
        if not href:
            continue
        link = urljoin(page, href)  # resolve relative hrefs against the page
        if link not in seen_links:
            seen_links.add(link)
            item_links.append(link)
# Visit every item detail page, locate its main image and download it.
print("Downloading images...")
for link in tqdm(item_links, desc="Images"):
    try:
        item_response = requests.get(link, timeout=30)
    except requests.RequestException as error:
        # A single unreachable item page should not abort the whole run.
        tqdm.write(f"Skipping item {link}: {error}")
        continue
    if item_response.status_code != 200:
        tqdm.write(f"Skipping item {link}: HTTP {item_response.status_code}")
        continue

    item_soup = BeautifulSoup(item_response.content, 'html.parser')
    # Find the image with the class "wp-post-image" on the item detail page
    image_tag = item_soup.find('img', class_="wp-post-image")
    if not image_tag:
        continue

    # Prefer the 'data-large_image' attribute; fall back to the plain 'src'
    # so that an item without that attribute still yields an image.
    img_url = image_tag.get('data-large_image') or image_tag.get('src')
    if not img_url:
        continue

    # Ensure the URL is absolute; if it starts with "//", add "https:",
    # otherwise resolve it against the item page URL.
    if img_url.startswith("//"):
        img_url = "https:" + img_url
    else:
        img_url = urljoin(link, img_url)

    try:
        download_image(img_url, 'maurelia_images')
    except requests.RequestException as error:
        # Keep going with the remaining items if this download fails.
        tqdm.write(f"Skipping image {img_url}: {error}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment