Saves URLs (from either a list or a root URL) to the Internet Archive's Wayback Machine
#! /usr/bin/env python3
#
"""
Save web pages to Wayback Machine. Argument urlsIn can either be
a text file with URLs (each line contains one URL), or a single
URL. In the first (input file) case it will simply save each URL.
In the latter case (input URL) it will extract all links from the URL, and
save those as well as the root URL (useful for saving a page with all
of its direct references). The optional --extensions argument can be used
to limit this to one or more specific file extensions. E.g. the following
will only save the root URL and any linked PDF and docx resources:
saveToWayback.py --extensions pdf,docx whatever.com/reports/ output.csv
Requirements:
- Waybackpy https://akamhy.github.io/waybackpy/
- beautifulsoup4 https://www.crummy.com/software/BeautifulSoup/
"""
import os
import sys
import csv
import re
import argparse
import requests
import waybackpy
from bs4 import BeautifulSoup
# Create parser
parser = argparse.ArgumentParser(
    description="Save URLs to the Wayback Machine")
def parseCommandLine():
    """Parse command line"""
    # Add arguments
    parser.add_argument('urlsIn',
                        action="store",
                        type=str,
                        help="either a file with URLs, or a single URL")
    parser.add_argument('fileOut',
                        action="store",
                        type=str,
                        help="output file")
    parser.add_argument('--extensions', '-e',
                        type=str,
                        help="comma-separated list of file extensions that controls "
                             "which linked resources from the input URL are saved",
                        action='store',
                        dest='extString',
                        default="")
    parser.add_argument('--maxtries', '-t',
                        type=int,
                        help="maximum number of tries to save each URL",
                        action='store',
                        dest='maxTries',
                        default=2)
    # Parse arguments
    args = parser.parse_args()
    return args
def errorExit(msg):
    """Print error message to stderr and exit"""
    msgString = "Error: " + msg + "\n"
    sys.stderr.write(msgString)
    sys.exit(1)
def isURL(url):
    """
    Check if string is a URL
    """
    # Source: https://stackoverflow.com/a/7160778/1209004
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(regex, url) is not None
def saveURL(url):
    """
    Save one URL; return archived URL, success flag and error message
    (empty string if no errors occurred)
    """
    user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
    errorMsg = ""
    wayback = waybackpy.Url(url, user_agent)
    try:
        archive = wayback.save()
        url_archived = archive.archive_url
        success = True
    except waybackpy.exceptions.WaybackError as error:
        success = False
        url_archived = ""
        errorMsg = str(error)
    return url_archived, success, errorMsg
def urlsFromFile(inputFile):
    """Read URLs from text file (one URL per line), return deduplicated list"""
    urls = []
    with open(inputFile, encoding="utf-8") as fIn:
        for line in fIn:
            line = line.strip()
            if line != "":
                urls.append(line)
    return list(set(urls))
def urlsFromPage(inputURL, extensions):
    """
    Return deduplicated list with inputURL and all links extracted from it
    whose href contains any of the strings in extensions. Note that hrefs
    are kept as found in the page, so relative links are not resolved
    against inputURL.
    """
    urls = [inputURL]
    reqs = requests.get(inputURL)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is not None:
            # With the default empty extensions string every link matches
            if any(substring in href for substring in extensions):
                urls.append(href)
    return list(set(urls))
def main():
    args = parseCommandLine()
    urlsIn = args.urlsIn
    fileOut = args.fileOut
    extString = args.extString
    extensions = extString.split(",")
    maxTries = args.maxTries
    # Get URLs from input page or file
    if isURL(urlsIn):
        urls = urlsFromPage(urlsIn, extensions)
    elif os.path.isfile(urlsIn):
        urls = urlsFromFile(urlsIn)
    else:
        errorExit("urlsIn is neither a file nor a URL")
    # Open output file in write mode
    of = open(fileOut, "w", encoding="utf-8")
    # Create CSV writer object
    csvOut = csv.writer(of, lineterminator='\n')
    # Write header row to output file
    csvOut.writerow(["url", "url_archived", "success", "errorMsg"])
    for url in urls:
        print("Processing URL: " + url)
        success = False
        tries = 0
        # Try saving until success, up to maxTries attempts
        while not success and tries < maxTries:
            url_archived, success, errorMsg = saveURL(url)
            tries += 1
        print("Success: " + str(success))
        csvOut.writerow([url, url_archived, str(success), errorMsg])
    of.close()


if __name__ == "__main__":
    main()
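# Example invocations (file names and domain below are hypothetical,
# shown only as a usage sketch):
#
#     python3 saveToWayback.py urls.txt results.csv
#     python3 saveToWayback.py --extensions pdf,docx https://example.com/reports/ results.csv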