Skip to content

Instantly share code, notes, and snippets.

@hhsprings
Last active December 29, 2022 12:11
Show Gist options
  • Save hhsprings/80899a4f1c92ae7d3ba32b174425473a to your computer and use it in GitHub Desktop.
Save hhsprings/80899a4f1c92ae7d3ba32b174425473a to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# (C) Copyright 2022 hhsprings, https://github.com/hhsprings
# SPDX-License-Identifier: MIT
import io
import os
import sys
import shutil
import re
import urllib.request
from collections import defaultdict
# Name of this script without its extension; it names the per-user cache
# directory below.
__MYNAME__, _ = os.path.splitext(
    os.path.basename(sys.modules[__name__].__file__))

# Base directory for the cache. HOME is tried first, then USERPROFILE (as
# before); os.path.expanduser is the last resort so that os.path.join never
# receives None when neither variable is set.
_HOME_DIR = (
    os.environ.get("HOME")
    or os.environ.get("USERPROFILE")
    or os.path.expanduser("~"))

# Local copy of the SPDX "licenses.md" file: <home>/.<script name>/licenses.md
_CACHE_FILE = os.path.join(
    _HOME_DIR,
    ".{}".format(__MYNAME__),
    "licenses.md")

# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(os.path.dirname(_CACHE_FILE), exist_ok=True)
def _update_cache():
    """Download the SPDX ``licenses.md`` and store it as ``_CACHE_FILE``.

    The download completes before the existing cache is touched, so a failed
    download leaves the previous cache file in place. The temporary file
    created by ``urlretrieve`` is removed even if copying fails.

    Raises:
        urllib.error.URLError: if the download fails.
        OSError: if the cache file cannot be written.
    """
    url = "https://raw.githubusercontent.com/spdx/license-list-data/main/licenses.md"
    downloaded, _headers = urllib.request.urlretrieve(url)
    # Stage next to the final location so os.replace stays on one filesystem
    # and the swap is a single rename.
    staged = _CACHE_FILE + ".tmp"
    try:
        shutil.copyfile(downloaded, staged)
        os.replace(staged, _CACHE_FILE)
    finally:
        os.remove(downloaded)
def _parse(licenses_md):
result = defaultdict(list)
contall = [line.rstrip() for line in io.open(licenses_md).readlines()]
keys = ["## Licenses with Short Idenifiers", "## Exception List", "## Deprecated Licenses"]
cont = iter(contall)
while keys:
for line in cont:
if line == keys[0]:
next(cont)
key = keys.pop(0)[len("## "):]
break
# header
head = [c.strip() for c in filter(None, next(cont).split("|"))]
next(cont)
# table contents
def _tr(cn, c):
# see https://github.com/spdx/license-list-XML/pull/1338/files
if cn[-1] == "?":
if c == "Y":
return True
elif c == "N":
return False
return None
return c
for line in cont:
if not line:
break
result[key].append(
dict(
zip(
head,
[_tr(head[i], c.strip())
for i, c in enumerate(filter(None, line.split("|")))])))
sidmap = {}
for line in cont:
m = re.match(r"^\[(.*)\]: (.*)", line)
sidmap[m.group(1)] = f"https://github.com/spdx/license-list-data/tree/main/{m.group(2)}"
#
for key in list(result.keys()):
for item in result[key]:
for k, v in list(item.items()):
if not hasattr(v, "encode"):
continue
m = re.match(r"\[(.*)\]\[\]", v)
if m:
item.pop(k)
item[k] = f"{m.group(1)}"
item[f"{k} (text)"] = f"{sidmap.get(m.group(1))}"
return result
def as_dict():
    """Return the parsed contents of the cached ``licenses.md``.

    The file at ``_CACHE_FILE`` is handed to ``_parse``; this function does
    not download the file itself.
    """
    parsed = _parse(_CACHE_FILE)
    return parsed
if __name__ == '__main__':
    # Command-line entry point: dump the parsed license list as JSON.
    import argparse
    import json

    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--refresh-cache", action="store_true",
        help="download licenses.md again even if a cached copy exists")
    ap.add_argument(
        "outfile", nargs="?",
        help="output file; omit or pass '-' to write to standard output")
    args = ap.parse_args()

    # Fetch the source file on first use or on explicit request.
    if args.refresh_cache or not os.path.exists(_CACHE_FILE):
        _update_cache()
    result = as_dict()

    if not args.outfile or args.outfile == "-":
        json.dump(result, sys.stdout, indent=4)
    else:
        # Context manager so the file is flushed and closed deterministically.
        with io.open(args.outfile, "w", encoding="utf-8") as outfile:
            json.dump(result, outfile, indent=4)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment