@peterbe
Created April 10, 2019 20:42

import concurrent.futures
import random
from urllib.parse import urlparse

import pyquery
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

import requests_cache

requests_cache.install_cache(
    "requests_cache1", expire_after=60 * 60, allowable_methods=["GET"]
)
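
# Note: install_cache() patches requests globally, so the GET requests used to
# crawl the index pages below are cached (by default in a local SQLite file)
# for an hour, while the HEAD probes in probe() are never cached because only
# "GET" is listed in allowable_methods.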

def requests_retry_session(
    retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504)
):
    """Opinionated wrapper that creates a requests session with an
    HTTPAdapter whose Retry policy includes connection retries.

    If you do the more naive retry by simply setting a number, e.g.::

        adapter = HTTPAdapter(max_retries=3)

    it will raise immediately on any connection errors.
    Retrying on connection errors guards better against unpredictable networks.
    The requests docs
    (http://docs.python-requests.org/en/master/api/?highlight=retries#requests.adapters.HTTPAdapter)
    say: "By default, Requests does not retry failed connections."

    The backoff_factor is documented here:
    https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#urllib3.util.retry.Retry
    A default of retries=3 and backoff_factor=0.3 means it will sleep like::

        [0.3, 0.6, 1.2]
    """  # noqa
    session = requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
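
# For illustration: urllib3's backoff sleep grows roughly as
# backoff_factor * 2 ** (retry_number - 1), which is where the [0.3, 0.6, 1.2]
# in the docstring above comes from. A minimal usage sketch, assuming a
# hypothetical flaky endpoint:
#
#     flaky_session = requests_retry_session(retries=5, backoff_factor=0.5)
#     response = flaky_session.get("https://example.com/sometimes-500s", timeout=10)
#     response.raise_for_status()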

def get_urls(base_url, max_urls, urls=None):
    # Avoid a mutable default argument; the recursive call below passes the
    # accumulated set explicitly.
    if urls is None:
        urls = set()
    response = session.get(base_url)
    response.raise_for_status()
    html = response.text
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(base_url=base_url)
    # List
    for a in doc("ul.document-list li a").items():
        href = a.attr("href")
        if "/Archive/" in href:
            continue
        assert urlparse(base_url).netloc == urlparse(href).netloc, href
        urls.add(href)
        if len(urls) >= max_urls:
            return urls
    # Paginator
    other_urls = set()
    for a in doc("ol.pagination li a").items():
        href = a.attr("href")
        assert urlparse(base_url).netloc == urlparse(href).netloc, href
        if href in other_urls or href in urls:
            continue
        other_urls.add(href)
    for other_url in random.sample(list(other_urls), min(3, len(other_urls))):
        urls.update(get_urls(other_url, max_urls, urls=urls))
        if len(urls) > max_urls:
            break
    return urls
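
# For reference (an assumption about the page markup, not stated in the gist):
# the selectors in get_urls() expect the /docs/all index to look roughly like
#
#     <ul class="document-list"><li><a href="/en-US/docs/...">...</a></li></ul>
#     <ol class="pagination"><li><a href="/en-US/docs/all?page=2">...</a></li></ol>
#
# i.e. a list of document links plus a paginator whose pages are sampled
# recursively until max_urls links have been gathered.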

# Shared session (with retries) used by get_urls(), probe() and sanitytest().
session = requests_retry_session()

def probe(url):
    response = session.head(url)
    response.raise_for_status()
    value = response.headers["x-cache"]
    print("{:100} {}".format(urlparse(url).path, value))
    return value
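
# CloudFront reports cache status in the x-cache response header, typically
# "Hit from cloudfront" or "Miss from cloudfront" (other variants such as
# "RefreshHit from cloudfront" exist), which is what start() tallies below.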

def start(samplesize):
    root = "https://developer.mozilla.org/en-US/docs/all"
    urls = get_urls(root, 1500)
    print("Gathered", len(urls), "URLs")
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {}
        for url in random.sample(list(urls), samplesize):
            futures[executor.submit(probe, url)] = url
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            results[futures[future]] = result
    misses = hits = 0
    for value in results.values():
        if "Miss from cloudfront" in value:
            misses += 1
        elif "Hit from cloudfront" in value:
            hits += 1
        else:
            raise NotImplementedError(value)
    print("HIT RATIO: {:.1f}%".format(100 * hits / (hits + misses)))

def sanitytest():
    import time

    for i in range(5):
        r = session.head(
            "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Math/hypot"
        )
        print(r.headers["x-cache"])
        time.sleep(1)

# sanitytest()
start(100)