@dcinzona
Last active November 21, 2022 14:28
A quick and dirty Python script I used to pull down all my old images from the web.archive.org cache of my Ghost blog after losing them during a server migration...
import re
import requests
from bs4 import BeautifulSoup
import os
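# Third-party dependencies (assuming a fairly standard setup): requests,
# beautifulsoup4 and html5lib; html5lib is needed because that's the parser
# name handed to BeautifulSoup below, e.g. pip install requests beautifulsoup4 html5lib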
SITE = "https://tandeciarz.com"
# this will end up being the archive URL
AURL = SITE
# timestamp for date before the snapshot I want to scrape
ts = "20221010"
# not really needed but doing this for self-reference later.
# API service to verify whether a snapshot exists and grab the snapshot URL
avail = f"https://archive.org/wayback/available?url={SITE}&timestamp={ts}"
# pretend I'm not a script
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
snapshots = requests.get(url=avail,headers=headers).json()
# all pages hosted start with this url
root = "http://web.archive.org"
# used to split prefix since it will include the timestamp
PREFIX = "/web/"
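# For reference, the availability API responds with JSON shaped roughly like
# this (illustrative values, not a captured response):
#   {"archived_snapshots": {"closest": {"available": true, "status": "200",
#     "timestamp": "20221015071024",
#     "url": "http://web.archive.org/web/20221015071024/https://tandeciarz.com/"}}}
# with "archived_snapshots" coming back empty if nothing was ever captured.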
# .get() avoids a KeyError if the API found no snapshot at all
if snapshots['archived_snapshots'].get('closest'):
    snap = snapshots['archived_snapshots']['closest']
    AURL = snap['url']
    PREFIX = AURL.replace(root, "").replace(SITE, "").replace("//", "/")
# I was going to use this to maintain a unique set of all the links on my site (pointing to a page on my site) for processing later
# but I ended up just using this as my target starting point
links = {AURL}
# The URLs of the images I'm trying to scrape
images = set()
# Whether the URL (blog post or page) was already retrieved via the script (so we don't go into a scrape loop)
retrieved = {}
# function to extract html document from given url
def getHTMLdocument(url):
    if url in retrieved:
        return retrieved[url]
    # request for HTML document of given url
    response = requests.get(url=url, headers=headers)
    # cache the raw HTML bytes so the same page isn't fetched twice
    retrieved[url] = response.content
    return response.content
# ends up looking like https://web.archive.org/web/20221015071024/https://tandeciarz.com/
TRUNC = PREFIX + SITE
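# Rough sketch of the archived markup this is written against (illustrative,
# reconstructed from the string handling below): the Wayback Machine rewrites
# asset URLs, so an <img> src tends to look something like
#   /web/20221015071024/https://tandeciarz.com/content/images/2020/05/photo.jpg
# and a Ghost feature image shows up as an inline style such as
#   style="background-image: url(/web/20221015071024/https://tandeciarz.com/content/images/2020/05/photo.jpg)"
# which is what processContent() below picks apart.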
def processContent(soup):
    # find all <img> tags and store the src url
    for img in soup.find_all('img'):
        src = img.get('src')
        if src is None:
            continue
        if "/https://tandeciarz.com/content/images/" in src:
            if src.startswith("/"):
                src = root + src
            if src not in images:
                images.add(src)
    # find all of the tags that have the style attribute with a background-image set (this is how Ghost renders out blog post images)
    # this was arguably the hardest part. GitHub Copilot effectively wrote this block of code for me :)
    for bgimg in soup.find_all(style=re.compile("background-image")):
        src = bgimg.get('style')
        if src is None:
            continue
        src = src.replace("background-image: url(", "").replace(");", "")
        # sometimes the URL would be the full URL, sometimes it would be the relative URL (not really sure why)
        if src.startswith("/"):
            src = root + src
        if src.endswith(")"):
            src = src[:-1]
        # honestly, I forgot this was a set
        if src not in images:
            images.add(src)
    # go through all the links on the page and track and scrape any links that go to my site
    for link in soup.find_all('a'):
        href = link.get('href')
        # don't follow anchors
        if href is None:
            continue
        if "/https://tandeciarz.com/" in href:
            if href.startswith("/"):
                href = root + href
            # I would uncomment continue to only pull the first page (to test)
            if href.startswith(AURL) and href not in retrieved:
                # continue
                print("Retrieving: " + href)
                processContent(BeautifulSoup(getHTMLdocument(href), 'html5lib'))
print("Retrieving: " + AURL)
processContent(BeautifulSoup(getHTMLdocument(AURL), 'html5lib'))
print("Images:")
uniqueImages = {}
# using a dict this time
for i in images:
    dirPath = i.split("/https://tandeciarz.com/")[1]
    if dirPath not in uniqueImages:
        uniqueImages[dirPath] = i
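# For example (illustrative path, not from a real run), a collected URL like
#   http://web.archive.org/web/20221015071024/https://tandeciarz.com/content/images/2020/05/photo.jpg
# is keyed by the relative path content/images/2020/05/photo.jpg, which doubles
# as the local file path the download loop below writes to.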
# get the URL and strip out everything before the /content so I can build out the folder structure locally and save the image
for i in uniqueImages:
    if os.path.exists(i):
        print("Skipping: " + i)
        continue
    dirpath = i.split("/")
    dirpath = "/".join(dirpath[:-1])
    if not os.path.exists(dirpath):
        os.makedirs(dirpath, exist_ok=True)
    imgfile = i.split("/")[-1]
    print("Downloading: " + imgfile)
    # download the image and save it to the same folder structure that's on my server, so I can easily copy them over when I'm done
    with open(i, 'wb') as handle:
        response = requests.get(uniqueImages[i], stream=True)
        if not response.ok:
            print(response)
        for block in response.iter_content(1024):
            if not block:
                break
            handle.write(block)