Last active
December 29, 2022 12:11
-
-
Save hhsprings/80899a4f1c92ae7d3ba32b174425473a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# (C) Copyright 2022 hhsprings, https://github.com/hhsprings | |
# SPDX-License-Identifier: MIT | |
import io | |
import os | |
import sys | |
import shutil | |
import re | |
import urllib.request | |
from collections import defaultdict | |
# Base name of this script file without its extension; it names the
# per-user cache directory below.
__MYNAME__, _ = os.path.splitext(os.path.basename(__file__))

# Cache location: <home>/.<script name>/licenses.md
# HOME is preferred, then USERPROFILE; os.path.expanduser("~") is the last
# resort so that os.path.join() never receives None when neither
# environment variable is set.
_CACHE_FILE = os.path.join(
    os.environ.get("HOME")
    or os.environ.get("USERPROFILE")
    or os.path.expanduser("~"),
    ".{}".format(__MYNAME__),
    "licenses.md")

# The cache directory is created as an import-time side effect.
# exist_ok=True replaces the separate os.path.exists() test, which could
# race with another process creating the same directory.
os.makedirs(os.path.dirname(_CACHE_FILE), exist_ok=True)
def _update_cache():
    """Download ``licenses.md`` from the SPDX data repository into the cache.

    The file is first retrieved into a temporary location chosen by
    ``urllib.request.urlretrieve`` and only then copied over
    ``_CACHE_FILE``.  An existing cache file is therefore left untouched
    when the download itself fails.

    Exceptions raised by the download or by the copy are propagated to
    the caller; the temporary download is removed in either case.
    """
    fn, _headers = urllib.request.urlretrieve(
        "https://raw.githubusercontent.com/spdx/license-list-data/main/licenses.md")
    try:
        # copyfile() replaces an existing destination, so the old cache
        # does not have to be deleted beforehand.
        shutil.copyfile(fn, _CACHE_FILE)
    finally:
        os.remove(fn)
def _parse(licenses_md):
    """Parse an SPDX ``licenses.md`` file into per-section lists of rows.

    The file is expected to contain, in this order, the three headings
    listed in ``headings`` below.  Each heading is followed by one line
    that is skipped, a pipe-delimited table header, a separator row that
    is skipped, and table rows up to the next empty line.  After the last
    table, every remaining line of the form ``[id]: target`` is taken as
    a link definition; other lines there (e.g. blank ones) are ignored.

    Args:
        licenses_md: Path of the markdown file to read (decoded as UTF-8).

    Returns:
        A ``defaultdict(list)`` keyed by the heading text without its
        leading ``"## "``.  Each value is a list with one dict per table
        row, mapping column title to cell text.  Cells of columns whose
        title ends with ``"?"`` become ``True`` for ``"Y"``, ``False``
        for ``"N"`` and ``None`` otherwise.  A cell that starts with
        ``[id][]`` is replaced by ``id`` and a ``"<column> (text)"``
        entry holding the URL built from the link definition of ``id``
        is added right after it.

    Raises:
        ValueError: If one of the headings is not found, or the file ends
            before the header and separator rows of a table.
    """
    # NOTE: headings are compared verbatim with the lines of the file, so
    # their spelling (including "Idenifiers") must not be "corrected" here.
    headings = [
        "## Licenses with Short Idenifiers",
        "## Exception List",
        "## Deprecated Licenses",
    ]
    link_definition = re.compile(r"^\[(.*)\]: (.*)")
    link_reference = re.compile(r"\[(.*)\]\[\]")

    def _cells(row):
        # Remove only the outer pipes before splitting, so that an empty
        # cell ("||") keeps its position instead of shifting the columns.
        row = row.strip()
        if row.startswith("|"):
            row = row[1:]
        if row.endswith("|"):
            row = row[:-1]
        return [cell.strip() for cell in row.split("|")]

    def _tr(column, cell):
        # Columns titled "...?" carry Y/N flags,
        # see https://github.com/spdx/license-list-XML/pull/1338/files
        if column.endswith("?"):
            return {"Y": True, "N": False}.get(cell)
        return cell

    with io.open(licenses_md, encoding="utf-8") as fobj:
        lines = iter([line.rstrip() for line in fobj])

    result = defaultdict(list)
    for heading in headings:
        # Advance the shared iterator up to the heading of this section.
        for line in lines:
            if line == heading:
                break
        else:
            raise ValueError(
                "section heading not found: {!r}".format(heading))
        key = heading[len("## "):]
        try:
            next(lines)  # the line directly below the heading
            head = _cells(next(lines))  # table header
            next(lines)  # header/body separator row
        except StopIteration:
            raise ValueError(
                "unexpected end of file in section: {!r}".format(heading)
            ) from None
        # table contents, terminated by the first empty line
        for line in lines:
            if not line:
                break
            result[key].append(
                {column: _tr(column, cell)
                 for column, cell in zip(head, _cells(line))})

    # Whatever follows the last table: "[id]: relative/path" definitions.
    sidmap = {}
    for line in lines:
        m = link_definition.match(line)
        if m is None:
            continue
        sidmap[m.group(1)] = f"https://github.com/spdx/license-list-data/tree/main/{m.group(2)}"

    for rows in result.values():
        for item in rows:
            for k, v in list(item.items()):
                if not isinstance(v, str):
                    continue
                m = link_reference.match(v)
                if m:
                    # Pop and re-insert so that the identifier and its
                    # "(text)" URL end up next to each other at the end.
                    item.pop(k)
                    item[k] = f"{m.group(1)}"
                    # An identifier without link definition yields the
                    # string "None" here (kept for output compatibility).
                    item[f"{k} (text)"] = f"{sidmap.get(m.group(1))}"
    return result
def as_dict():
    """Return the parsed contents of the cached ``licenses.md``.

    The cache file ``_CACHE_FILE`` is handed to ``_parse`` and its result
    is returned unchanged.  Nothing is downloaded here, so the cache file
    has to exist already.
    """
    parsed = _parse(_CACHE_FILE)
    return parsed
if __name__ == '__main__':
    # Command line entry point: dump the parsed license tables as JSON.
    #
    #   --refresh-cache  download licenses.md again even if it is cached
    #   outfile          destination path; omitted or "-" means stdout
    import argparse
    import json

    ap = argparse.ArgumentParser()
    ap.add_argument("--refresh-cache", action="store_true")
    ap.add_argument("outfile", nargs="?")
    args = ap.parse_args()

    # Fetch the markdown on first use, or whenever the user asks for it.
    if not os.path.exists(_CACHE_FILE) or args.refresh_cache:
        _update_cache()
    result = as_dict()

    if not args.outfile or args.outfile == "-":
        json.dump(result, sys.stdout, indent=4)
    else:
        # The with-block makes sure the output is flushed and closed.
        with io.open(args.outfile, "w", encoding="utf-8") as outfile:
            json.dump(result, outfile, indent=4)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment