@yingziwu · Created November 28, 2017 15:01
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from six.moves.urllib.parse import urljoin
def capture(
    target_url,
    user_agent="savepagenow (https://github.com/pastpages/savepagenow)",
    accept_cache=False,
    proxies=None
):
"""
Archives the provided URL using archive.org's Wayback Machine.
Returns the archive.org URL where the capture is stored.
Raises a CachedPage exception if archive.org declines to conduct a new
capture and returns a previous snapshot instead.
To silence that exception, pass into True to the ``accept_cache`` keyword
argument.
"""
    # Put together the URL that will save our request
    domain = "https://web.archive.org"
    save_url = urljoin(domain, "/save/")
    request_url = save_url + target_url

    # Send the capture request to archive.org
    headers = {
        'User-Agent': user_agent,
    }
    response = requests.get(request_url, headers=headers, proxies=proxies)
    # archive.org signals a robots.txt block with a 403 plus a runtime-error header
    if response.status_code == 403:
        if response.headers.get('X-Archive-Wayback-Runtime-Error') == 'RobotAccessControlException: Blocked By Robots':
            raise BlockedByRobots("archive.org returned blocked by robots.txt error")
    # Put together the URL where this page is archived
    archive_id = response.headers['Content-Location']
    archive_url = urljoin(domain, archive_id)

    # Determine whether archive.org served a cached snapshot
    cached = response.headers.get('X-Page-Cache') == 'HIT'
    if cached and not accept_cache:
        raise CachedPage("archive.org returned a cached version of this page: {}".format(
            archive_url
        ))
    return archive_url
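
# A minimal usage sketch for capture(); the target URL and the snapshot path
# shown in the comment are hypothetical, not captures this script has made:
#
#     url = capture("https://example.com/", accept_cache=True)
#     # e.g. "https://web.archive.org/web/20171128150100/https://example.com/"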

def capture_or_cache(
    target_url,
    user_agent="savepagenow (https://github.com/pastpages/savepagenow)"
):
"""
Archives the provided URL using archive.org's Wayback Machine, unless
the page has been recently captured.
Returns a tuple with the archive.org URL where the capture is stored,
along with a boolean indicating if a new capture was conducted.
If the boolean is True, archive.org conducted a new capture. If it is False,
archive.org has returned a recently cached capture instead, likely taken
in the previous minutes.
"""
    try:
        return capture(target_url, user_agent=user_agent, accept_cache=False), True
    except CachedPage:
        return capture(target_url, user_agent=user_agent, accept_cache=True), False
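
# Sketch (hypothetical URL): capture_or_cache() signals fresh vs cached via
# the second element of the returned tuple:
#
#     url, captured = capture_or_cache("https://example.com/")
#     # captured is True for a fresh snapshot, False for a recent cached one.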

class CachedPage(Exception):
    """
    This error is raised when archive.org declines to make a new capture
    and instead returns the most recent cached capture.
    """
    pass


class BlockedByRobots(Exception):
    """
    This error is raised when archive.org is blocked by the target site's
    robots.txt access control instructions.
    """
    pass

def Topic_status_test(
    topic_id,
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36",
    proxies=None
):
"""
Test the v2ex topic status.
Status 403: Forbidden
Status 404: Not Found
Status 302: Redirect to index page
Status 401: Unauthorized
Status 200: OK
"""
    base_url = "https://www.v2ex.com/t/"
    request_url = urljoin(base_url, str(topic_id))
    headers = {
        'User-Agent': user_agent,
    }
    response = requests.get(request_url, headers=headers, proxies=proxies)
    if response.status_code == 403:
        return 403
    if response.status_code == 404 and '404 Topic Not Found' in response.text:
        return 404
    # v2ex redirects deleted or hidden topics to the index page
    if response.url == 'https://www.v2ex.com/':
        return 302
    # v2ex redirects topics that require login to the sign-in page
    if 'signin' in response.url:
        return 401
    return 200
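
# Sketch (hypothetical topic id): turning the return code into a decision:
#
#     status = Topic_status_test(410301)
#     if status == 200:
#         print("topic is public and can be archived")
#     else:
#         print("skipping, topic status is %d" % status)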

class TopicStatusError(Exception):
    """
    This error is raised when the topic can't be archived.
    """
    pass

def Save_topic(
    topic_id,
    proxies_archive=None,
    proxies_v2ex=None
):
    """
    Save the v2ex topic to the Internet Archive (https://archive.org/).
    """
    base_url = "https://www.v2ex.com/t/"
    request_url = urljoin(base_url, str(topic_id))
    topic_status = Topic_status_test(topic_id, proxies=proxies_v2ex)
    if topic_status == 200:
        # Accept a cached snapshot so we always return a usable archive URL
        # rather than a CachedPage exception object.
        archive_url = capture(request_url, accept_cache=True, proxies=proxies_archive)
        return archive_url
    else:
        raise TopicStatusError(
            "The status of topic %s is %d. This topic can't be archived." % (topic_id, topic_status)
        )

if __name__ == '__main__':
    # Usage:
    # 1. Command line
    #    If https://archive.org/ is directly reachable, just run
    #    python3 save_v2ex_topic_to_internet_archive.py [topic_id]
    #    e.g. python3 save_v2ex_topic_to_internet_archive.py 410301
    #
    #    If https://archive.org/ cannot be reached directly, set a proxy with
    #    the "--proxies_archive" and "--proxies_v2ex" options,
    #    e.g. python3 save_v2ex_topic_to_internet_archive.py 410224 --proxies_archive "socks5://127.0.0.1:1080"
    #
    # 2. Import directly
    #    import save_v2ex_topic_to_internet_archive as v2ex_archive
    #    archive_url = v2ex_archive.Save_topic(topic_id, proxies_archive, proxies_v2ex)
    #    print(archive_url)
    import click

    @click.command()
    @click.argument("topic_id")
    @click.option("--proxies_archive", help="proxy for the archive.org request, e.g. socks5://127.0.0.1:1080")
    @click.option("--proxies_v2ex", help="proxy for the www.v2ex.com request, e.g. socks5://127.0.0.1:1080")
    def cli(topic_id, proxies_archive, proxies_v2ex):
        kwargs = {}
        if proxies_archive:
            kwargs['proxies_archive'] = {"http": proxies_archive, "https": proxies_archive}
        if proxies_v2ex:
            kwargs['proxies_v2ex'] = {"http": proxies_v2ex, "https": proxies_v2ex}
        archive_url = Save_topic(topic_id, **kwargs)
        click.echo(archive_url)

    cli()