Skip to content

Instantly share code, notes, and snippets.

@mara004
Last active July 14, 2023 11:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mara004/881d0c5a99b8444fd5d1d21a333b70f8 to your computer and use it in GitHub Desktop.
Save mara004/881d0c5a99b8444fd5d1d21a333b70f8 to your computer and use it in GitHub Desktop.
Parse pdfbox versions and build a nice, robust representation
# SPDX-FileCopyrightText: 2023 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: CC-BY-4.0 OR Apache-2.0
import re
from datetime import datetime
from urllib.request import urlopen
from packaging.version import Version
# Directory listing that is scraped for PDFBox release folders.
PB_RELEASE_URL = "https://archive.apache.org/dist/pdfbox/"
# Group 1: link target of a release folder (must start with digits/dots, trailing
# slash excluded); group 2: the "date time" stamp printed after the link.
PB_DISTS_RE = r'<a href="([\d\.]+.+?)/">.+</a>\s+([\d\-]+ [\d:]+)'
# strptime format for the stamp captured by group 2.
PB_DATE_FMT = r"%Y-%m-%d %H:%M"

# Use the response as a context manager so the connection is closed
# deterministically instead of being left to the garbage collector.
with urlopen(PB_RELEASE_URL) as response:
    content = response.read().decode("utf-8")

# One (Version, datetime) pair per release folder found in the listing.
results = [
    (Version(m.group(1)), datetime.strptime(m.group(2), PB_DATE_FMT))
    for m in re.finditer(PB_DISTS_RE, content)
]

# Bucket the pairs by major version; sorted() makes the key order (and thus
# the print order below) deterministic rather than dependent on set iteration.
ver_dict = {major: [] for major in sorted({v.major for v, _ in results})}
for v, d in results:
    ver_dict[v.major].append((v, d))

for v in ver_dict.values():
    v.sort(key=lambda r: r[1])  # by date

# Print each major version followed by its releases, one pair per line.
for k, v in ver_dict.items():
    print(k, *v, sep="\n", end="\n\n")

# Last entry of the date-sorted major-3 bucket, i.e. the most recently dated
# 3.x release. Raises KeyError if the listing contains no major-3 version.
latest_v3 = ver_dict[3][-1]
print(latest_v3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment