Skip to content

Instantly share code, notes, and snippets.

@iamdual
Last active November 17, 2018 00:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iamdual/c77c332d28e864c43cf68e3b3cac85ae to your computer and use it in GitHub Desktop.
Save iamdual/c77c332d28e864c43cf68e3b3cac85ae to your computer and use it in GitHub Desktop.
Entire Site Archiver By sitemap.xml
#!/usr/bin/python
import sys
import time
from xml.dom.minidom import parseString

import requests
from user_agent import generate_user_agent
# The sitemap URL is a required first command-line argument.
if len(sys.argv) < 2:
    # Passing a string to sys.exit() writes it to stderr and exits with
    # status 1, so calling shells/scripts can detect the misuse instead of
    # seeing a "successful" exit.
    sys.exit("Usage: ./archiver.py <sitemap_url>")
def save_page(page_url, timeout=120):
    """Ask the Wayback Machine to archive ``page_url``.

    Issues a GET request to ``http://web.archive.org/save/<page_url>`` with a
    generated User-Agent and a web.archive.org Referer header.

    Args:
        page_url: URL of the page to archive; appended verbatim to the
            save endpoint.
        timeout: Seconds to wait for the archive to respond before the
            attempt is treated as failed. Without a timeout a stalled
            connection would block the script indefinitely.

    Returns:
        True if the request completed (the HTTP status code is not
        inspected), False if the connection failed or timed out so the
        caller may retry.
    """
    headers = {
        "User-Agent": generate_user_agent(os=("mac", "linux")),
        "Referer": "http://web.archive.org",
    }
    try:
        requests.get(
            "http://web.archive.org/save/" + page_url,
            headers=headers,
            timeout=timeout,
        )
    except (requests.ConnectionError, requests.Timeout):
        # A read timeout is a Timeout but not a ConnectionError, so both
        # must be listed for every transient failure to become a retry.
        print("Retry: " + page_url)
        return False
    print("Saving: " + page_url)
    return True
# Upper bound on attempts per page and the pause between them, so that a
# persistently failing URL neither loops forever nor hammers the archive.
MAX_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 5

sitemap_url = sys.argv[1].strip()
sitemap_src = requests.get(sitemap_url, timeout=60)
# Fail loudly on 4xx/5xx rather than trying to parse an error page as XML.
sitemap_src.raise_for_status()

# Parse the raw bytes so the XML parser honours the document's own encoding
# declaration; Response.text may be decoded with a guessed charset, which
# corrupts non-ASCII URLs.
dom = parseString(sitemap_src.content)

for loc in dom.getElementsByTagName("loc"):
    text_node = loc.firstChild
    # An empty <loc/> has no child node; skip it instead of crashing.
    if text_node is None:
        continue
    page_url = (text_node.nodeValue or "").strip()
    if not page_url:
        continue

    for attempt in range(1, MAX_ATTEMPTS + 1):
        if save_page(page_url):
            break
        if attempt < MAX_ATTEMPTS:
            time.sleep(RETRY_DELAY_SECONDS)
    else:
        # All attempts failed; report it and move on to the next URL.
        print("Giving up: " + page_url, file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment