Skip to content

Instantly share code, notes, and snippets.

@dslaw
Last active October 23, 2018 22:22
Show Gist options
  • Save dslaw/95378b17f2a24ce9a81b9b94edc0df48 to your computer and use it in GitHub Desktop.
Save dslaw/95378b17f2a24ce9a81b9b94edc0df48 to your computer and use it in GitHub Desktop.
Extract data element numbers for Univ of Oregon solar data
"""Parse data element numbers as trees."""
# XXX: Second spectral digit is broken - both values are 0!
from dataclasses import dataclass
from lxml import html
from typing import Any, Dict, List
DELIMITER = "\x97" # —
MIN_COLSPAN = 1
URL = "http://solardat.uoregon.edu/DataElementNumbers.html"
@dataclass(frozen=True)
class _Node:
    """One digit of a data element number and the digits nested beneath it.

    ``frozen=True`` only prevents re-binding the attributes. The ``children``
    list itself stays mutable, and ``parse_table`` depends on that: it
    creates every node with an empty list and appends descendants later.
    """

    # Value of this position in the element number.
    digit: int
    # Meaning of the digit, with runs of whitespace collapsed by the parser.
    description: str
    # Nodes one level deeper, in the order they were appended.
    children: List["_Node"]
def parse_table(table: html.HtmlElement) -> List[_Node]:
    """Parse an element-number table into a forest of ``_Node`` trees.

    Nesting is encoded in the ``colspan`` of the last cell of each row: the
    widest cell marks a top-level digit, and every unit of colspan less than
    that is one level deeper.

    Args:
        table: The table element; iterating it must yield its rows.

    Returns:
        The top-level nodes in document order, each carrying its descendants
        in ``children``.

    Raises:
        RuntimeError: If no row declares a colspan greater than
            ``MIN_COLSPAN`` (including an empty table), if a row's depth
            works out negative, or if a nested row appears before any row
            that could be its parent.
    """
    # Skip children without cells (empty rows, comments) so ``row[-1]`` is safe.
    rows = [row for row in table if len(row) > 0]
    max_colspan = max(
        (int(row[-1].get("colspan", -1)) for row in rows),
        default=-1,
    )
    if max_colspan <= MIN_COLSPAN:
        raise RuntimeError(
            "table has no cell with colspan greater than "
            f"{MIN_COLSPAN}; cannot infer nesting depth"
        )

    roots: List[_Node] = []
    for row in rows:
        # There will be either one or two cells. If two, the first is just
        # padding, so the last cell in the row is always the one we want.
        ele = row[-1]
        # ``.text`` is None when the cell has no text before its first child.
        text = ele.text or ""
        # The row may not have an element number description.
        if DELIMITER not in text:
            continue

        # Split on the first delimiter only, so a dash inside the description
        # does not break the two-way unpacking.
        digit, description = text.split(DELIMITER, 1)
        description = " ".join(description.split())
        node = _Node(int(digit), description, children=[])

        colspan = int(ele.get("colspan", MIN_COLSPAN))
        depth = max_colspan - colspan
        if depth < 0:
            raise RuntimeError(
                f"colspan {colspan} exceeds the table maximum {max_colspan}"
            )

        # Walk down the most recently added node at each level to find the
        # list this node belongs in.
        siblings = roots
        for _ in range(depth):
            if not siblings:
                raise RuntimeError(
                    f"row {description!r} is nested at depth {depth} "
                    "but has no parent row before it"
                )
            siblings = siblings[-1].children
        siblings.append(node)
    return roots
if __name__ == "__main__":
    import json
    import requests

    def convert_node(node: _Node) -> Dict[str, Any]:
        """Convert a ``_Node`` tree into plain dicts and lists for JSON."""
        return {
            "digit": node.digit,
            "description": node.description,
            "children": [convert_node(child) for child in node.children],
        }

    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(URL, timeout=30)
    response.raise_for_status()
    tree = html.fromstring(response.content)

    # Names are paired with the matching tables purely by document order.
    table_names = [
        "solar",
        "spectral",
        "meteorological",
    ]
    table_xpath = '//table[@cellpadding="4" and @cellspacing="0" and @border="1"]'
    tables = tree.xpath(table_xpath)
    if len(tables) < len(table_names):
        # zip() below would otherwise drop the trailing names silently.
        raise RuntimeError(
            f"expected at least {len(table_names)} element-number tables, "
            f"found {len(tables)}"
        )

    out: Dict[str, List[_Node]] = {
        name: parse_table(table) for name, table in zip(table_names, tables)
    }
    # ``default`` is called for each _Node the encoder cannot serialize itself.
    print(json.dumps(out, indent=2, default=convert_node))
# Parse into a flat sequence of codes.
def parse_table(table: html.HtmlElement) -> List[Dict[str, Any]]:
    """Parse an element-number table into a flat list of complete codes.

    Nesting is encoded in the ``colspan`` of the last cell of each row: the
    widest cell is the top level, and a cell of colspan 1 is the deepest
    level. Each deepest-level row is combined with the most recent row seen
    at every shallower level to form one code.

    Args:
        table: The table element; iterating it must yield its rows.

    Returns:
        One dict per deepest-level row with two keys: ``"digits"``, the digit
        strings of every level concatenated from the top level down, and
        ``"descriptions"``, the matching descriptions in the same order.

    Raises:
        RuntimeError: If no row declares a colspan greater than
            ``MIN_COLSPAN`` (including an empty table), if a colspan falls
            outside the range the table allows, or if a deepest-level row is
            missing an ancestor at some level.
    """
    # Skip children without cells (empty rows, comments) so ``row[-1]`` is safe.
    rows = [row for row in table if len(row) > 0]
    max_colspan = max(
        (int(row[-1].get("colspan", -1)) for row in rows),
        default=-1,
    )
    if max_colspan <= MIN_COLSPAN:
        raise RuntimeError(
            "table has no cell with colspan greater than "
            f"{MIN_COLSPAN}; cannot infer nesting depth"
        )

    codes: List[Dict[str, Any]] = []
    # current[d] is the latest entry seen at depth d, or None if there is none.
    current: List[Any] = [None] * max_colspan
    leaf_depth = len(current) - 1

    for row in rows:
        # The last cell carries the text; a leading cell, if any, is padding.
        ele = row[-1]
        # ``.text`` is None when the cell has no text before its first child.
        text = ele.text or ""
        if DELIMITER not in text:
            continue

        # Split on the first delimiter only, so a dash inside the description
        # does not break the two-way unpacking.
        digit, description = text.split(DELIMITER, 1)
        description = " ".join(description.split())

        colspan = int(ele.get("colspan", MIN_COLSPAN))
        depth = max_colspan - colspan
        if not 0 <= depth <= leaf_depth:
            raise RuntimeError(
                f"colspan {colspan} is outside the range 1..{max_colspan}"
            )

        current[depth] = {
            # Strip so the concatenated code contains only the digits.
            "digit": digit.strip(),
            "description": description,
        }
        # Entries below this level belonged to the previous branch.
        for deeper in range(depth + 1, len(current)):
            current[deeper] = None

        if depth == leaf_depth:
            if any(level is None for level in current):
                raise RuntimeError(
                    f"row {description!r} is at the deepest level but is "
                    "missing a parent row at some level above it"
                )
            codes.append({
                "digits": "".join(level["digit"] for level in current),
                "descriptions": [level["description"] for level in current],
            })
            current[depth] = None
    return codes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment