@dcinzona
Last active November 21, 2022 14:28
A quick and dirty Python script I used to pull down all my old images from the web.archive.org cache of my Ghost blog after losing them during a server migration...
import re
import requests
from bs4 import BeautifulSoup
import os
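# Third-party dependencies (assuming a fairly standard setup): requests,
# beautifulsoup4 and html5lib; html5lib is needed because that's the parser
# name handed to BeautifulSoup below, e.g. pip install requests beautifulsoup4 html5lib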
SITE = "https://tandeciarz.com"
# this will end up being the archive URL
AURL = SITE
# timestamp for date before the snapshot I want to scrape
ts = "20221010"
# not really needed but doing this for self-reference later.
# API service to verify whether a snapshot exists and grab the snapshot URL
avail = f"https://archive.org/wayback/available?url={SITE}&timestamp={ts}"
# pretend I'm not a script
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
snapshots = requests.get(url=avail,headers=headers).json()
# all pages hosted start with this url
root = "http://web.archive.org"
# used to split prefix since it will include the timestamp
PREFIX = "/web/"
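# For reference, the availability API responds with JSON shaped roughly like
# this (illustrative values, not a captured response):
#   {"archived_snapshots": {"closest": {"available": true, "status": "200",
#     "timestamp": "20221015071024",
#     "url": "http://web.archive.org/web/20221015071024/https://tandeciarz.com/"}}}
# with "archived_snapshots" coming back empty if nothing was ever captured.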
# .get() avoids a KeyError if the API found no snapshot at all
if snapshots['archived_snapshots'].get('closest'):
    snap = snapshots['archived_snapshots']['closest']
    AURL = snap['url']
    PREFIX = AURL.replace(root, "").replace(SITE, "").replace("//", "/")
# I was going to use this to maintain a unique set of all the links on my site (pointing to a page on my site) for processing later
# but I ended up just using this as my target starting point
links = {AURL}
# The URLs of the images I'm trying to scrape
images = set()
# Whether the URL (blog post or page) was already retrieved via the script (so we don't go into a scrape loop)
retrieved = {}
# function to extract html document from given url
def getHTMLdocument(url):
    if url in retrieved:
        return retrieved[url]
    # request for HTML document of given url
    response = requests.get(url=url, headers=headers)
    # cache the raw HTML bytes so the same page isn't fetched twice
    retrieved[url] = response.content
    return response.content
# ends up looking like https://web.archive.org/web/20221015071024/https://tandeciarz.com/
TRUNC = PREFIX + SITE
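# Rough sketch of the archived markup this is written against (illustrative,
# reconstructed from the string handling below): the Wayback Machine rewrites
# asset URLs, so an <img> src tends to look something like
#   /web/20221015071024/https://tandeciarz.com/content/images/2020/05/photo.jpg
# and a Ghost feature image shows up as an inline style such as
#   style="background-image: url(/web/20221015071024/https://tandeciarz.com/content/images/2020/05/photo.jpg)"
# which is what processContent() below picks apart.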
def processContent(soup):
    # find all <img> tags and store the src url
    for img in soup.find_all('img'):
        src = img.get('src')
        if src is None:
            continue
        if "/https://tandeciarz.com/content/images/" in src:
            if src.startswith("/"):
                src = root + src
            if src not in images:
                images.add(src)
    # find all of the tags that have the style attribute with a background-image set (this is how Ghost renders out blog post images)
    # this was arguably the hardest part. GitHub Copilot effectively wrote this block of code for me :)
    for bgimg in soup.find_all(style=re.compile("background-image")):
        src = bgimg.get('style')
        if src is None:
            continue
        src = src.replace("background-image: url(", "").replace(");", "")
        # sometimes the URL would be the full URL, sometimes it would be the relative URL (not really sure why)
        if src.startswith("/"):
            src = root + src
        if src.endswith(")"):
            src = src[:-1]
        # honestly, I forgot this was a set
        if src not in images:
            images.add(src)
    # go through all the links on the page and track and scrape any links that go to my site
    for link in soup.find_all('a'):
        href = link.get('href')
        # don't follow anchors
        if href is None:
            continue
        if "/https://tandeciarz.com/" in href:
            if href.startswith("/"):
                href = root + href
            # I would uncomment continue to only pull the first page (to test)
            if href.startswith(AURL) and href not in retrieved:
                # continue
                print("Retrieving: " + href)
                processContent(BeautifulSoup(getHTMLdocument(href), 'html5lib'))
print("Retrieving: " + AURL)
processContent(BeautifulSoup(getHTMLdocument(AURL), 'html5lib'))
print("Images:")
uniqueImages = {}
# using a dict this time
for i in images:
    dirPath = i.split("/https://tandeciarz.com/")[1]
    if dirPath not in uniqueImages:
        uniqueImages[dirPath] = i
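# For example (illustrative path, not from a real run), a collected URL like
#   http://web.archive.org/web/20221015071024/https://tandeciarz.com/content/images/2020/05/photo.jpg
# is keyed by the relative path content/images/2020/05/photo.jpg, which doubles
# as the local file path the download loop below writes to.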
# get the URL and strip out everything before the /content so I can build out the folder structure locally and save the image
for i in uniqueImages:
    if os.path.exists(i):
        print("Skipping: " + i)
        continue
    dirpath = i.split("/")
    dirpath = "/".join(dirpath[:-1])
    if not os.path.exists(dirpath):
        os.makedirs(dirpath, exist_ok=True)
    imgfile = i.split("/")[-1]
    print("Downloading: " + imgfile)
    # download the image and save it to the same folder structure that's on my server, so I can easily copy them over when I'm done
    with open(i, 'wb') as handle:
        response = requests.get(uniqueImages[i], stream=True)
        if not response.ok:
            print(response)
        for block in response.iter_content(1024):
            if not block:
                break
            handle.write(block)