@yingziwu · Created November 28, 2017 15:01
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from six.moves.urllib.parse import urljoin
def capture(
    target_url,
    user_agent="savepagenow (https://github.com/pastpages/savepagenow)",
    accept_cache=False,
    proxies=None
):
"""
Archives the provided URL using archive.org's Wayback Machine.
Returns the archive.org URL where the capture is stored.
Raises a CachedPage exception if archive.org declines to conduct a new
capture and returns a previous snapshot instead.
To silence that exception, pass into True to the ``accept_cache`` keyword
argument.
"""
    # Put together the URL that will save our request
    domain = "https://web.archive.org"
    save_url = urljoin(domain, "/save/")
    request_url = save_url + target_url

    # Send the capture request to archive.org
    headers = {
        'User-Agent': user_agent,
    }
    response = requests.get(request_url, headers=headers, proxies=proxies)
    # archive.org signals a robots.txt block with a 403 plus a runtime-error header
    if response.status_code == 403:
        if response.headers.get('X-Archive-Wayback-Runtime-Error') == 'RobotAccessControlException: Blocked By Robots':
            raise BlockedByRobots("archive.org returned blocked by robots.txt error")
    # Put together the URL where this page is archived
    archive_id = response.headers['Content-Location']
    archive_url = urljoin(domain, archive_id)

    # Determine whether archive.org served a cached snapshot
    cached = response.headers.get('X-Page-Cache') == 'HIT'
    if cached and not accept_cache:
        raise CachedPage("archive.org returned a cached version of this page: {}".format(
            archive_url
        ))
    return archive_url
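
# A minimal usage sketch for capture(); the target URL and the snapshot path
# shown in the comment are hypothetical, not captures this script has made:
#
#     url = capture("https://example.com/", accept_cache=True)
#     # e.g. "https://web.archive.org/web/20171128150100/https://example.com/"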

def capture_or_cache(
    target_url,
    user_agent="savepagenow (https://github.com/pastpages/savepagenow)"
):
"""
Archives the provided URL using archive.org's Wayback Machine, unless
the page has been recently captured.
Returns a tuple with the archive.org URL where the capture is stored,
along with a boolean indicating if a new capture was conducted.
If the boolean is True, archive.org conducted a new capture. If it is False,
archive.org has returned a recently cached capture instead, likely taken
in the previous minutes.
"""
    try:
        return capture(target_url, user_agent=user_agent, accept_cache=False), True
    except CachedPage:
        return capture(target_url, user_agent=user_agent, accept_cache=True), False
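
# Sketch (hypothetical URL): capture_or_cache() signals fresh vs cached via
# the second element of the returned tuple:
#
#     url, captured = capture_or_cache("https://example.com/")
#     # captured is True for a fresh snapshot, False for a recent cached one.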

class CachedPage(Exception):
    """
    This error is raised when archive.org declines to make a new capture
    and instead returns the most recent cached capture.
    """
    pass


class BlockedByRobots(Exception):
    """
    This error is raised when archive.org is blocked by the target site's
    robots.txt access control instructions.
    """
    pass

def Topic_status_test(
    topic_id,
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36",
    proxies=None
):
"""
Test the v2ex topic status.
Status 403: Forbidden
Status 404: Not Found
Status 302: Redirect to index page
Status 401: Unauthorized
Status 200: OK
"""
    base_url = "https://www.v2ex.com/t/"
    request_url = urljoin(base_url, str(topic_id))
    headers = {
        'User-Agent': user_agent,
    }
    response = requests.get(request_url, headers=headers, proxies=proxies)
    if response.status_code == 403:
        return 403
    if response.status_code == 404 and '404 Topic Not Found' in response.text:
        return 404
    # v2ex redirects deleted or hidden topics to the index page
    if response.url == 'https://www.v2ex.com/':
        return 302
    # v2ex redirects topics that require login to the sign-in page
    if 'signin' in response.url:
        return 401
    return 200
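
# Sketch (hypothetical topic id): turning the return code into a decision:
#
#     status = Topic_status_test(410301)
#     if status == 200:
#         print("topic is public and can be archived")
#     else:
#         print("skipping, topic status is %d" % status)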

class TopicStatusError(Exception):
    """
    This error is raised when the topic can't be archived.
    """
    pass

def Save_topic(
    topic_id,
    proxies_archive=None,
    proxies_v2ex=None
):
    """
    Save the v2ex topic to the Internet Archive (https://archive.org/).
    """
    base_url = "https://www.v2ex.com/t/"
    request_url = urljoin(base_url, str(topic_id))
    topic_status = Topic_status_test(topic_id, proxies=proxies_v2ex)
    if topic_status == 200:
        # Accept a cached snapshot so we always return a usable archive URL
        # rather than a CachedPage exception object.
        archive_url = capture(request_url, accept_cache=True, proxies=proxies_archive)
        return archive_url
    else:
        raise TopicStatusError(
            "The status of topic %s is %d. This topic can't be archived." % (topic_id, topic_status)
        )

if __name__ == '__main__':
    # Usage:
    # 1. Command line
    #    If https://archive.org/ is directly reachable, just run
    #    python3 save_v2ex_topic_to_internet_archive.py [topic_id]
    #    e.g. python3 save_v2ex_topic_to_internet_archive.py 410301
    #
    #    If https://archive.org/ cannot be reached directly, set a proxy with
    #    the "--proxies_archive" and "--proxies_v2ex" options,
    #    e.g. python3 save_v2ex_topic_to_internet_archive.py 410224 --proxies_archive "socks5://127.0.0.1:1080"
    #
    # 2. Import directly
    #    import save_v2ex_topic_to_internet_archive as v2ex_archive
    #    archive_url = v2ex_archive.Save_topic(topic_id, proxies_archive, proxies_v2ex)
    #    print(archive_url)
    import click

    @click.command()
    @click.argument("topic_id")
    @click.option("--proxies_archive", help="proxy for the archive.org request, e.g. socks5://127.0.0.1:1080")
    @click.option("--proxies_v2ex", help="proxy for the www.v2ex.com request, e.g. socks5://127.0.0.1:1080")
    def cli(topic_id, proxies_archive, proxies_v2ex):
        kwargs = {}
        if proxies_archive:
            kwargs['proxies_archive'] = {"http": proxies_archive, "https": proxies_archive}
        if proxies_v2ex:
            kwargs['proxies_v2ex'] = {"http": proxies_v2ex, "https": proxies_v2ex}
        archive_url = Save_topic(topic_id, **kwargs)
        click.echo(archive_url)

    cli()