Created
April 10, 2019 20:42
-
-
Save peterbe/2f1b2b32dcf00981d4795e058d2b7100 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import concurrent.futures | |
import random | |
from urllib.parse import urlparse | |
import pyquery | |
import requests | |
from requests.adapters import HTTPAdapter | |
from requests.packages.urllib3.util.retry import Retry | |
import requests_cache | |
# Cache GET responses on disk ("requests_cache1.sqlite") for one hour so
# repeated runs don't re-fetch the same index pages. requests_cache
# monkey-patches the global `requests` machinery once installed.
requests_cache.install_cache(
    "requests_cache1", expire_after=60 * 60, allowable_methods=["GET"]
)
def requests_retry_session(
    retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504)
):
    """Build a ``requests.Session`` with a retry policy mounted.

    Unlike the naive ``HTTPAdapter(max_retries=3)``, the ``Retry`` object
    used here also retries on connection errors, which guards better on
    unpredictable networks ("By default, Requests does not retry failed
    connections.").

    The backoff_factor is documented at
    https://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#urllib3.util.retry.Retry
    — with retries=3 and backoff_factor=0.3 the sleeps are
    approximately [0.3, 0.6, 1.2].
    """  # noqa
    retry_policy = Retry(
        connect=retries,
        read=retries,
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)
    new_session = requests.Session()
    # Same adapter handles both schemes.
    for scheme in ("http://", "https://"):
        new_session.mount(scheme, retrying_adapter)
    return new_session
def get_urls(base_url, max_urls, urls=None):
    """Collect up to ``max_urls`` same-host document URLs from ``base_url``.

    Scrapes the ``ul.document-list`` links on the page (skipping anything
    under "/Archive/"), then recursively follows up to 3 randomly chosen
    pagination links until enough URLs are gathered.

    Returns the accumulated set of absolute URLs.
    """
    # Bug fix: the original signature used a mutable default (urls=set()),
    # which is shared across calls — a second top-level call would start
    # with the previous call's results already in the set.
    if urls is None:
        urls = set()
    response = session.get(base_url)
    response.raise_for_status()
    html = response.text
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(base_url=base_url)
    # Document list
    for a in doc("ul.document-list li a").items():
        href = a.attr("href")
        if "/Archive/" in href:
            continue
        # Sanity check that make_links_absolute kept us on the same host.
        assert urlparse(base_url).netloc == urlparse(href).netloc, href
        urls.add(href)
        if len(urls) >= max_urls:
            return urls
    # Paginator — only pages not already gathered or queued.
    other_urls = set()
    for a in doc("ol.pagination li a").items():
        href = a.attr("href")
        assert urlparse(base_url).netloc == urlparse(href).netloc, href
        if href in other_urls or href in urls:
            continue
        other_urls.add(href)
    # Bug fix: random.sample() on a set is deprecated since Python 3.9 and
    # raises TypeError since 3.11 — sample from a list instead.
    for other_url in random.sample(list(other_urls), min(3, len(other_urls))):
        urls.update(get_urls(other_url, max_urls, urls=urls))
        if len(urls) > max_urls:
            break
    return urls
# Module-level session shared by probe()/get_urls()/sanitytest(). Defined
# after those functions textually, but before any of them is called.
session = requests_retry_session()
def probe(url):
    """HEAD-request ``url`` and return its "x-cache" response header.

    Also prints the URL path (left-padded to 100 columns) next to the
    header value for eyeballing progress.
    """
    head_response = session.head(url)
    head_response.raise_for_status()
    cache_status = head_response.headers["x-cache"]
    print(f"{urlparse(url).path:100} {cache_status}")
    return cache_status
def start(samplesize):
    """Probe ``samplesize`` random MDN URLs and print the CloudFront hit ratio.

    Gathers candidate URLs from the all-documents index, issues concurrent
    HEAD requests via a thread pool, and tallies "Hit"/"Miss from
    cloudfront" values from the x-cache headers.
    """
    root = "https://developer.mozilla.org/en-US/docs/all"
    urls = get_urls(root, 1500)
    print("Gathered", len(urls), "URLs")
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = {}
        # Bug fix: random.sample() requires a sequence (sets raise
        # TypeError on Python 3.11+). Also clamp the sample size so a
        # short URL harvest doesn't raise ValueError.
        for url in random.sample(list(urls), min(samplesize, len(urls))):
            futures[executor.submit(probe, url)] = url
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            results[futures[future]] = result
    misses = hits = 0
    for value in results.values():
        if "Miss from cloudfront" in value:
            misses += 1
        elif "Hit from cloudfront" in value:
            hits += 1
        else:
            # Unexpected x-cache value — fail loudly rather than miscount.
            raise NotImplementedError(value)
    print("HIT RATIO: {:.1f}%".format(100 * hits / (hits + misses)))
def sanitytest():
    """Hit one known MDN URL five times, printing x-cache each time.

    Useful for manually verifying that the CDN (and requests_cache) are
    behaving as expected before running the full probe.
    """
    import time

    target = (
        "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Math/hypot"
    )
    for _ in range(5):
        response = session.head(target)
        print(response.headers["x-cache"])
        time.sleep(1)
# Guard the entry point so importing this module doesn't immediately fire
# ~100 network requests; running it as a script behaves as before.
if __name__ == "__main__":
    # sanitytest()  # uncomment to verify CDN/cache behavior on one URL first
    start(100)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment