Created
May 21, 2024 17:26
-
-
Save mwender/4000b7412e2eb25032db9d18293e87e7 to your computer and use it in GitHub Desktop.
[SquareSpace Image Downloader] This Python script downloads all images found in a SquareSpace site XML export. #python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import os
import shutil
import xml.etree.ElementTree as ET
from urllib.parse import unquote, urlsplit

import requests
from bs4 import BeautifulSoup
# Setup basic logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Parse the XML export. The post body is stored in <content:encoded>, which
# belongs to the RSS "content" module namespace, so the prefix must be mapped
# explicitly for ElementTree lookups.
xml_file = 'example.xml'
tree = ET.parse(xml_file)
root = tree.getroot()
namespaces = {'content': 'http://purl.org/rss/1.0/modules/content/'}

# Substrings that mark a URL as an image. They are matched case-insensitively
# and anywhere in the URL (not only at the end) so that URLs carrying a query
# string after the file name are still recognised.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.gif', '.webp')

# A set de-duplicates images referenced from more than one item.
images = set()
for item in root.findall('.//item'):
    content = item.find('content:encoded', namespaces)
    if content is None or content.text is None:
        logging.warning('No content found in an item or content is empty.')
        continue

    # The element text is an HTML fragment; collect the src of every <img>.
    soup = BeautifulSoup(content.text, 'html.parser')
    for img_tag in soup.find_all('img'):
        img_url = img_tag.get('src')
        if not img_url:
            continue
        lowered_url = img_url.lower()
        if any(ext in lowered_url for ext in IMAGE_EXTENSIONS):
            images.add(img_url)
# Download every collected image into the local "images" directory.
# Failures are logged per image and never abort the remaining downloads.
if not images:
    logging.warning('No images found to download.')
else:
    logging.info(f'Found {len(images)} images to download.')

    # open() does not create missing directories, so make sure the target
    # directory exists before the first write.
    os.makedirs('images', exist_ok=True)

    for img in images:
        # Build the local name from the URL *path* only: this drops any query
        # string or fragment, decodes percent-escapes, and basename() strips
        # directory components that decoding may have introduced.
        filename = os.path.basename(unquote(urlsplit(img).path))
        if not filename:
            logging.error(f'Cannot derive a file name from {img}; skipping.')
            continue

        try:
            logging.info(f'Downloading image from {img}')
            # The timeout keeps one unresponsive server from stalling the whole
            # run; the with-block releases the connection on every exit path.
            with requests.get(img, stream=True, timeout=30) as resp:
                if resp.status_code == 200:
                    with open(os.path.join('images', filename), 'wb') as local_file:
                        # Have urllib3 undo gzip/deflate transfer encoding so
                        # the raw bytes written are the actual image data.
                        resp.raw.decode_content = True
                        shutil.copyfileobj(resp.raw, local_file)
                    logging.info(f'Successfully downloaded {img}')
                else:
                    logging.error(f'Failed to download {img}. Status code: {resp.status_code}')
        except Exception as e:
            # Deliberately broad: this is a best-effort batch job, so any
            # network or filesystem error is logged and the loop moves on.
            logging.error(f'Error downloading {img}. Error: {e}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment