# Source: GitHub gist mnot/7f8de5920f84ef77eea13f81016e3a5f
# Author: @mnot -- created February 8, 2022 08:14
#!/usr/bin/env python3
""" Use the W3C API to understand what document licenses are in use."""
import re
import sys
import time
from urllib.parse import urlparse, urlunparse, urljoin
from bs4 import BeautifulSoup, SoupStrainer
import requests
import requests_cache
# W3C API key appended to every API request by apifetch(); the placeholder
# value must be replaced with a real key before running the script.
API_KEY = "REPLACE_ME"
# Retry limit consulted by fetch() when a request fails with a connection error.
RETRY_MAX = 3
# Argument passed to time.sleep() by fetch() before each retry (seconds).
RETRY_WAIT = 2
# Shared HTTP session with a response cache named "W3C_cache"; all requests
# made by fetch() go through it.
s = requests_cache.CachedSession("W3C_cache")
# Matches runs of non-word characters; findlicense() replaces each run with a
# single space when normalising link text.
ws = re.compile(r"\W+")
# Passed as parse_only= to BeautifulSoup in htmlfetch() so that only elements
# with class "copyright" are parsed.
COPYRIGHT = SoupStrainer(class_="copyright")
def apifetch(url):
    """Fetch a W3C API resource as JSON, following pagination links.

    Args:
        url: Absolute URL, or a path/relative reference that is resolved
            against ``https://api.w3.org/``.

    Returns:
        The decoded JSON body. When the body advertises a ``next`` link under
        ``_links``, the following pages are fetched recursively and the result
        is whatever ``combine_members`` builds from this page and the rest.

    Raises:
        Whatever ``fetch`` raises for a failed request (it signals an
        unexpected HTTP status with ``AssertionError``).
    """
    parts = urlparse(urljoin("https://api.w3.org/", url))
    key_param = f"apikey={API_KEY}"
    # Only join with "&" when there is an existing query to append to;
    # otherwise the query would start with a stray "&".
    query = f"{parts.query}&{key_param}" if parts.query else key_param
    fetch_url = urlunparse(parts._replace(query=query))
    response = fetch(fetch_url, headers={"accept": "application/json"})
    results = response.json()
    if "next" in results.get("_links", {}):
        next_uri = results["_links"]["next"]["href"]
        next_results = apifetch(next_uri)
        results = combine_members(results, next_results)
    return results
def htmlfetch(spec_details, retries=0):
    """Print one tab-separated line describing a specification's license.

    The line written to stdout has the columns: shortlink URL, title,
    delivering group, latest-version title (status), and the license link
    found in the document's copyright section.

    Args:
        spec_details: Specification record from the API; must provide
            ``shortlink``, ``title`` and ``_links["latest-version"]`` with
            ``title`` and ``href``.
        retries: Unused; retained so existing callers keep working.

    Returns:
        None. If the document itself cannot be fetched, the reason is written
        to stderr and nothing is printed for this specification.
    """
    url = spec_details["shortlink"]
    title = spec_details["title"]
    latest = spec_details["_links"]["latest-version"]
    status = latest["title"]
    deliverers = apifetch(latest["href"] + "/deliverers")["_links"].get("deliverers")
    # Treat an absent *and* an empty deliverers list the same way; indexing
    # [0] on an empty list would raise IndexError.
    group = deliverers[0]["title"] if deliverers else "Unknown"
    try:
        html = fetch(url)
    except AssertionError as why:
        # fetch() reports an unexpected HTTP status via AssertionError.
        sys.stderr.write(f"* {str(why)}\n")
        return
    soup = BeautifulSoup(html.text, "html.parser", parse_only=COPYRIGHT)
    doc_license = findlicense(soup)
    print(f"{url}\t{title}\t{group}\t{status}\t{doc_license}")
def fetch(url, headers=None, retries=0):
    """GET *url* through the shared cached session, retrying on connection errors.

    Args:
        url: Address to request.
        headers: Optional mapping of request headers.
        retries: Number of retries already spent on this URL; callers
            normally leave it at 0.

    Returns:
        The response object; its status code is 200 or 206.

    Raises:
        AssertionError: The response status is neither 200 nor 206. Raised
            explicitly (not via ``assert``) so the check survives ``python -O``.
        SystemExit: After ``RETRY_MAX`` retries have failed with a connection
            error (a message is written to stderr first).
    """
    while True:
        try:
            response = s.get(url, headers=headers)
            break
        except requests.exceptions.ConnectionError:
            # ">=" so that at most RETRY_MAX retries are attempted.
            if retries >= RETRY_MAX:
                sys.stderr.write(f"Max retries for {url}; aborting.\n")
                sys.exit(1)
            time.sleep(RETRY_WAIT)
            sys.stderr.write(f"Retrying {url}\n")
            retries += 1
    if response.status_code not in (200, 206):
        raise AssertionError(f"{response.status_code} on {url}")
    return response
def findlicense(copyright):
    """Return the href of the first license link in a parsed copyright section.

    Each ``<a>`` element's text is lower-cased, runs of non-word characters
    are collapsed to a single space, and the result is compared against the
    known license-link labels.

    Args:
        copyright: Parsed document (or fragment) exposing ``find_all``.

    Returns:
        The ``href`` of the first anchor whose label matches and that has an
        ``href`` attribute, or the string ``"Unknown"`` when there is none.
    """
    known_labels = {
        "document use",
        "document license",
        "document licensing",
        "permissive document license",
    }
    for tag in copyright.find_all("a"):
        # Strip *after* normalising: punctuation at either end becomes a
        # space and would otherwise prevent the label from matching.
        text = ws.sub(" ", tag.get_text(strip=True).lower()).strip()
        if text not in known_labels:
            continue
        href = tag.get("href")
        # An anchor without href cannot name a license; keep looking rather
        # than failing with KeyError.
        if href is not None:
            return href
    return "Unknown"
def combine_members(a, b):
    """Merge the ``_embedded`` member lists of two API result pages.

    For every member named in ``a["_embedded"]``, the result holds ``a``'s
    list followed by ``b``'s list for the same member (or nothing if ``b``
    lacks it). Members present only in ``b``, and every top-level key other
    than ``_embedded``, are not carried over. Neither input is modified.

    Returns:
        A new dict of the form ``{"_embedded": {member: combined_list}}``.
    """
    section = "_embedded"
    merged = {
        name: a[section][name] + b[section].get(name, [])
        for name in a[section]
    }
    return {section: merged}
def main():
    """Report the license of every specification listed by the W3C API.

    Fetches the embedded specification listing, hands each record to
    ``htmlfetch`` (which prints one line per specification), and finally
    writes the number of records processed to stderr.
    """
    listing = apifetch("/specifications?embed=true")
    specifications = listing["_embedded"]["specifications"]
    processed = 0
    for spec_details in specifications:
        htmlfetch(spec_details)
        processed += 1
    # NOTE(review): the count is assumed to be reported once, after the loop
    # -- confirm against the intended output.
    sys.stderr.write(f"{processed}\n")
# Run the survey only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# Comments on this gist are hosted on GitHub (sign-in required to comment).