# cblte/maurelia.py

## maurelia.py
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, urlparse
from tqdm import tqdm


# Make sure the output directory exists before anything is downloaded
os.makedirs(name='maurelia_images', exist_ok=True)


def download_image(url, folder, timeout=30):
    """
    Download the image at ``url`` and store it in ``folder``.

    The file is named after the last path segment of the URL; the query
    string and fragment are ignored. Nothing is written when the URL has no
    usable file name (for example when its path ends in ``/``) or when the
    server answers with a status code other than 200.

    :param url: Absolute URL of the image to fetch.
    :param folder: Path of an existing directory that receives the file.
    :param timeout: Seconds to wait for the server before giving up; passed
        on to ``requests.get``.
    :raises requests.RequestException: If the request fails or times out.
    :raises OSError: If the target file cannot be written.
    """
    # Derive the file name first: without one, joining it onto the folder
    # would make open() target the directory itself, so skip the request.
    image_name = os.path.basename(urlparse(url).path)
    if not image_name:
        return

    # Without a timeout a stalled server would block the script forever.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return

    with open(os.path.join(folder, image_name), 'wb') as f:
        f.write(response.content)


# Listing pages of the shop whose products should be scraped
pages = [
    "https://maurelia.de/Shop/lasercut/",
    "https://maurelia.de/Shop/kerzensticker/",
    "https://maurelia.de/Shop/plottandstamp/",
    "https://maurelia.de/Shop/plottdesign-plotterdateien-kinderkleidung/",
    "https://maurelia.de/Shop/wundertuete/",
    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/1",
    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/2",
    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/3",
    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/4",
]

# Seconds to wait for any single HTTP response before giving up
REQUEST_TIMEOUT = 30

item_links = []

# Pass 1: collect the product-detail URLs linked from every listing page.
print("Parsing pages for URLS...")
for page in tqdm(pages, desc="Pages"):
    try:
        response = requests.get(page, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable listing page should not abort the whole run.
        tqdm.write(f"Skipping page {page}: {exc}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Product cards link to their detail page through these anchors.
    for anchor in soup.select('a.woocommerce-LoopProduct-link.woocommerce-loop-product__link'):
        href = anchor.get('href')
        if href:
            # Resolve relative hrefs against the page they were found on.
            item_links.append(urljoin(page, href))

# A product can be listed on several pages; fetch each one only once while
# keeping the order in which the links were first seen.
item_links = list(dict.fromkeys(item_links))


# Pass 2: open every product page and download its main image.
print("Downloading images...")

for link in tqdm(item_links, desc="Images"):
    try:
        item_response = requests.get(link, timeout=REQUEST_TIMEOUT)
        item_response.raise_for_status()
    except requests.RequestException as exc:
        tqdm.write(f"Skipping item {link}: {exc}")
        continue

    item_soup = BeautifulSoup(item_response.content, 'html.parser')

    # The main product image carries the class "wp-post-image"; its
    # 'data-large_image' attribute holds the URL that gets downloaded.
    image_tag = item_soup.find('img', class_="wp-post-image")
    img_url = image_tag.get('data-large_image') if image_tag else None
    if not img_url:
        continue

    # Make the URL absolute: protocol-relative URLs ("//host/...") get an
    # explicit https scheme, anything else is resolved against the item page.
    if img_url.startswith("//"):
        img_url = "https:" + img_url
    else:
        img_url = urljoin(link, img_url)

    try:
        download_image(img_url, 'maurelia_images')
    except requests.RequestException as exc:
        tqdm.write(f"Failed to download {img_url}: {exc}")
	import requests
	from bs4 import BeautifulSoup
	import os
	from urllib.parse import urljoin, urlparse
	from tqdm import tqdm



	# NOTE(review): repeats the directory setup already done earlier in this
	# file; this tab-indented section looks like an accidental duplicate paste.
	# Confirm, then remove the whole section or re-indent it to column 0.
	os.makedirs(name='maurelia_images', exist_ok=True)


	# NOTE(review): this redefines download_image from earlier in the file; the
	# surrounding tab-indented section looks like an accidental duplicate paste.
	def download_image(url, folder, timeout=30):
	    """
	    Download the image at ``url`` and store it in ``folder``.

	    The file is named after the last path segment of the URL; the query
	    string and fragment are ignored. Nothing is written when the URL has no
	    usable file name (for example when its path ends in ``/``) or when the
	    server answers with a status code other than 200.

	    :param url: Absolute URL of the image to fetch.
	    :param folder: Path of an existing directory that receives the file.
	    :param timeout: Seconds to wait for the server before giving up; passed
	        on to ``requests.get``.
	    :raises requests.RequestException: If the request fails or times out.
	    :raises OSError: If the target file cannot be written.
	    """
	    # Derive the file name first: without one, joining it onto the folder
	    # would make open() target the directory itself, so skip the request.
	    image_name = os.path.basename(urlparse(url).path)
	    if not image_name:
	        return

	    # Without a timeout a stalled server would block the script forever.
	    response = requests.get(url, timeout=timeout)
	    if response.status_code != 200:
	        return

	    with open(os.path.join(folder, image_name), 'wb') as f:
	        f.write(response.content)


	# NOTE(review): this section repeats the scraping script found earlier in
	# the file and would run the whole scrape a second time; it looks like an
	# accidental duplicate paste. Confirm and remove it if so.

	# Listing pages of the shop whose products should be scraped
	pages = [
	    "https://maurelia.de/Shop/lasercut/",
	    "https://maurelia.de/Shop/kerzensticker/",
	    "https://maurelia.de/Shop/plottandstamp/",
	    "https://maurelia.de/Shop/plottdesign-plotterdateien-kinderkleidung/",
	    "https://maurelia.de/Shop/wundertuete/",
	    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/1",
	    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/2",
	    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/3",
	    "https://maurelia.de/Shop/digistamps-maurelia-illustrationen/page/4",
	]

	# Seconds to wait for any single HTTP response before giving up
	REQUEST_TIMEOUT = 30

	item_links = []

	# Pass 1: collect the product-detail URLs linked from every listing page.
	print("Parsing pages for URLS...")
	for page in tqdm(pages, desc="Pages"):
	    try:
	        response = requests.get(page, timeout=REQUEST_TIMEOUT)
	        response.raise_for_status()
	    except requests.RequestException as exc:
	        # One unreachable listing page should not abort the whole run.
	        tqdm.write(f"Skipping page {page}: {exc}")
	        continue

	    soup = BeautifulSoup(response.content, 'html.parser')

	    # Product cards link to their detail page through these anchors.
	    for anchor in soup.select('a.woocommerce-LoopProduct-link.woocommerce-loop-product__link'):
	        href = anchor.get('href')
	        if href:
	            # Resolve relative hrefs against the page they were found on.
	            item_links.append(urljoin(page, href))

	# A product can be listed on several pages; fetch each one only once while
	# keeping the order in which the links were first seen.
	item_links = list(dict.fromkeys(item_links))


	# Pass 2: open every product page and download its main image.
	print("Downloading images...")

	for link in tqdm(item_links, desc="Images"):
	    try:
	        item_response = requests.get(link, timeout=REQUEST_TIMEOUT)
	        item_response.raise_for_status()
	    except requests.RequestException as exc:
	        tqdm.write(f"Skipping item {link}: {exc}")
	        continue

	    item_soup = BeautifulSoup(item_response.content, 'html.parser')

	    # The main product image carries the class "wp-post-image"; its
	    # 'data-large_image' attribute holds the URL that gets downloaded.
	    image_tag = item_soup.find('img', class_="wp-post-image")
	    img_url = image_tag.get('data-large_image') if image_tag else None
	    if not img_url:
	        continue

	    # Make the URL absolute: protocol-relative URLs ("//host/...") get an
	    # explicit https scheme, anything else is resolved against the item page.
	    if img_url.startswith("//"):
	        img_url = "https:" + img_url
	    else:
	        img_url = urljoin(link, img_url)

	    try:
	        download_image(img_url, 'maurelia_images')
	    except requests.RequestException as exc:
	        tqdm.write(f"Failed to download {img_url}: {exc}")