Skip to content

Instantly share code, notes, and snippets.

@justvanrossum
Last active August 20, 2020 08:50
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save justvanrossum/ddd59331130793b647cad08b30e6d89a to your computer and use it in GitHub Desktop.
Save justvanrossum/ddd59331130793b647cad08b30e6d89a to your computer and use it in GitHub Desktop.
Scrape OT tag definitions and descriptions from OT spec site (script, language and feature tags)
import os
import re
def parse(data):
    """Yield the cell values of each table row in an HTML page.

    Only the text between the first ``<tbody>`` and the first ``</tbody>``
    is scanned.  ``str.find`` returns -1 for a missing marker; no error is
    raised in that case and the slice simply uses the resulting bounds.

    For every attribute-less ``<td>`` cell of every attribute-less ``<tr>``
    row, the following is collected, in order:

    * the target of the first ``href="..."`` in the cell, if there is one;
    * the text between the first pair of ``&#39;`` entities (an HTML-escaped
      single quote) if the cell contains one, otherwise the raw cell
      contents.

    Rows that produce no values are skipped.  Each yielded item is a list
    of strings.
    """
    bodyStart = data.find("<tbody>") + 7  # 7 == length of the opening tag
    bodyEnd = data.find("</tbody>")
    tableBody = data[bodyStart:bodyEnd]

    for rowHTML in re.findall(r"<tr>.+?</tr>", tableBody, re.DOTALL):
        row = []
        for cell in re.findall(r"<td>(.+?)</td>", rowHTML, re.DOTALL):
            linkMatch = re.search(r'href="(.+?)"', cell)
            if linkMatch is not None:
                # A linked cell contributes its link target first.
                row.append(linkMatch.group(1))
            # Tags are written as 'xxxx' with escaped quotes; take what sits
            # between the first two of them, or the whole cell otherwise.
            quoteParts = cell.split("&#39;")
            row.append(quoteParts[1] if len(quoteParts) >= 2 else cell)
        if row:
            yield row
def formatFeatures(data, baseURL):
    """Print the feature table as a Python dict literal named ``features``.

    ``data`` is an iterable of ``(link, tag, friendlyName)`` triples and
    ``baseURL`` is prepended to every link.  Each printed entry maps a tag
    to a ``(friendly name, documentation URL)`` tuple.

    A row whose tag is ``'cv01'`` is expanded into 99 entries, ``cv01``
    through ``cv99``, all sharing that row's friendly name and link.
    """
    print("features = {")
    print(" # tag, friendly name, documentation URL")
    for link, firstTag, friendlyName in data:
        # One row normally yields one entry; 'cv01' fans out to cv01..cv99.
        tags = (
            [f"cv{i:02d}" for i in range(1, 100)]
            if firstTag == 'cv01'
            else [firstTag]
        )
        for tag in tags:
            print(f" {tag!r}: ({friendlyName!r}, {baseURL+link!r}),")
    print("}")
def formatScripts(data):
    """Print the script table as a Python dict literal named ``scripts``.

    ``data`` is an iterable of ``(friendlyName, tag)`` pairs.  When a tag
    occurs more than once, the friendly names are joined with ``", "`` in
    order of appearance and the tag is printed once, at the position of its
    first occurrence.

    ``data`` is only read, never modified, so any iterable (including a
    generator) may be passed.
    """
    print("scripts = {")
    print(" # tag, friendly name")
    # dicts preserve insertion order, so iterating this mapping afterwards
    # reproduces the first-occurrence order of the tags.
    namesByTag = {}
    for friendlyName, tag in data:
        if tag in namesByTag:
            namesByTag[tag] += ", " + friendlyName
        else:
            namesByTag[tag] = friendlyName
    for tag, friendlyName in namesByTag.items():
        print(f" {tag!r}: {friendlyName!r},")
    print("}")
def formatLanguages(data):
    """Print the language table as a Python dict literal named ``languages``.

    Each item of ``data`` unpacks to a friendly name followed by one or two
    further fields: the tag and, optionally, a comma-separated string of
    ISO 639 codes.  Tags shorter than four characters are right-padded with
    spaces to exactly four.  Each printed entry maps the padded tag to a
    tuple of the friendly name followed by the individual, whitespace-
    stripped ISO codes.

    An ``AssertionError`` is raised for a tag longer than four characters
    or for a row with more than two fields after the friendly name.
    """
    print("languages = {")
    print(" # tag, friendly name, ISO 639 IDs (if applicable)")
    for name, *rest in data:
        # Pad short tags with trailing spaces up to four characters.
        tag = rest[0].ljust(4)
        assert len(tag) == 4, tag
        isoCodes = ()
        if len(rest) > 1:
            assert len(rest) == 2
            isoCodes = tuple(code.strip() for code in rest[1].split(","))
        t = (name,) + isoCodes
        print(f" {tag!r}: {t},")
    print("}")
# https://docs.microsoft.com/en-us/typography/opentype/spec/featurelist
# https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags
# https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
if __name__ == "__main__":
    # Command-line entry point.
    #
    # With no arguments, the three spec pages are downloaded and a complete
    # module (header comment, __all__, and the three dict literals) is
    # printed to stdout.  With one argument, that file is read as a saved
    # spec page and only the matching dict literal is printed.
    import sys

    baseURL = "https://docs.microsoft.com/en-us/typography/opentype/spec/"

    if len(sys.argv) > 1:
        # Decode saved pages exactly like downloaded ones (UTF-8, undecodable
        # bytes replaced) instead of relying on the platform default encoding.
        with open(sys.argv[1], encoding="utf-8", errors="replace") as f:
            html = f.read()
        pages = [html]
    else:
        import urllib.request

        pages = []
        print(f"# Generated by {os.path.basename(__file__)}")
        print("# Scraped from:")
        for page in ["featurelist", "scripttags", "languagetags"]:
            url = baseURL + page
            print(f"# {url}")
            with urllib.request.urlopen(url) as fp:
                html = fp.read().decode("utf-8", errors="replace")
            pages.append(html)
        # NOTE(review): the source this was recovered from had lost its
        # indentation.  The __all__ trailer is assumed to belong to this
        # branch, since it names all three tables and only this branch
        # produces all three -- confirm against the original script.
        print()
        print()
        print("__all__ = ['features', 'scripts', 'languages']")
        print()

    for html in pages:
        print()
        parsed = list(parse(html))
        # The page kind is recognised by the start of its <title> element.
        if "<title>Registered features" in html:
            formatFeatures(parsed, baseURL)
        elif "<title>Script tags" in html:
            formatScripts(parsed)
        elif "<title>Language system tags" in html:
            formatLanguages(parsed)
        else:
            # Raised explicitly rather than via `assert 0`, so the check is
            # not stripped when Python runs with -O.
            raise AssertionError("huh.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment