Created
May 3, 2022 08:26
-
-
Save greyblue9/38e2f171c0247eff5a60a6480be932a1 to your computer and use it in GitHub Desktop.
pypi Search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from datetime import datetime | |
from sys import argv | |
from bs4 import BeautifulSoup, GuessedAtParserWarning, Tag | |
from warnings import simplefilter | |
from urllib.request import Request, urlopen | |
from urllib.parse import quote_plus, urljoin | |
# Keep the filter for environments where bs4 still guesses (harmless now
# that a parser is passed explicitly below).
simplefilter(category=GuessedAtParserWarning, action="ignore")

# Search term from the command line; fall back to "gif" for a demo run.
query = " ".join(argv[1:]) if argv[1:] else "gif"
url = "https://pypi.org/search/?q={query}".format(query=quote_plus(query))

# Fetch the PyPI search-results page and parse it.  Naming the parser
# ("html.parser", stdlib) makes parsing deterministic across machines —
# otherwise bs4 picks whichever parser is installed (e.g. lxml), which can
# build a different tree and break the sibling/child walking done below.
with urlopen(Request(method="GET", url=url)) as f:
    doc = BeautifulSoup(f.read(), "html.parser")
def find_name(cur: "Tag") -> str:
    """Derive a field name from the nearest ancestor with a ``package-*`` class.

    Walks from *cur* up the parent chain until an element is found whose
    ``class`` list has an entry starting with ``"package-"``.  That entry,
    stripped of the ``"package-"`` prefix (and an optional ``"snippet__"``
    prefix after it), is the field name, e.g.
    ``package-snippet__name`` -> ``"name"``.  Returns ``""`` when no such
    ancestor exists.
    """
    # Compare against None explicitly: a bs4 Tag with no children is falsy
    # (truthiness delegates to __len__), so `while cur and ...` would
    # wrongly bail out on an empty element that *does* carry the class.
    while cur is not None and not (
        cur.attrs.get("class")
        and any(c.startswith("package-") for c in cur["class"])
    ):
        cur = cur.parent
    if cur is None:  # walked past the document root: no package-* ancestor
        return ""
    # The while-condition guarantees at least one matching class entry, so
    # next() cannot raise here (the original's trailing fallback return
    # was unreachable and has been dropped).
    cls = next(c for c in cur["class"] if c.startswith("package-"))
    return cls.removeprefix("package-").removeprefix("snippet__")
# Locate the first element whose class starts with "package-" (one field of
# a search result), then climb while the current element is an only child,
# so that `el.parent` ends up being the container holding one node per
# search result.
el = doc.select('[class^="package-"]')[0]
while el in el.parent.select(":first-child:last-child"):
    el = el.parent
# One Tag per search result; skips NavigableString whitespace nodes.
elems = [c for c in el.parent.children if isinstance(c, Tag)]
# For each result, build {field name -> first non-empty text} from every
# descendant whose class starts with "package-".
# NOTE(review): the outer loop variable `elems` deliberately shadows the
# list above — inside the dict comprehension `elems` is the per-result Tag
# (comprehensions get their own scope; only `enumerate(elems)` is evaluated
# against the outer list).  `idx` is unused.
# NOTE(review): the inner generator's filter iterates `c in e` but the
# value expression reads `e`/`x`, not `c` — so the outer next() only checks
# that *some* child of `e` has non-empty text, then extracts the first
# non-empty `.string`/`.text` among e's children.  Looks intentional but
# fragile — confirm before restructuring.
infos = [
    {
        find_name(e)
        :
        next(
            next(
                (x.string or x.text).strip()
                for x in e
                if (x.string or x.text).strip()
            )
            if isinstance(e, Tag)
            else str(e).strip()
            for c in e
            if str(c).strip()
        )
        for e in elems.select('[class^="package-"]')
    }
    for idx, elems in enumerate(elems)
]
# Enrich each result dict with absolute link URLs and parsed timestamps.
# `elems` and `infos` are index-aligned by construction, so walk them in
# lockstep.
for elem, data in zip(elems, infos):
    # The result element itself counts as a link when it is an anchor,
    # followed by every nested anchor that carries an href.
    anchors = ([elem] if elem.name.lower() == "a" else []) + list(
        elem.select("a[href]")
    )
    for anchor in anchors:
        absolute = urljoin(url, anchor.attrs.get("href"))
        # First link found becomes the canonical "url"; any further links
        # are keyed by their own package-* class name.
        if "url" not in data:
            data["url"] = absolute
        else:
            data[find_name(anchor)] = absolute
    # Convert <time datetime="..."> stamps to aware datetime objects.
    # fromisoformat (pre-3.11) rejects "+0000" and "Z", hence the rewrites.
    for stamp in elem.select("time[datetime]"):
        iso = stamp["datetime"].replace("+0000", "+00:00").replace("Z", "+00:00")
        data[find_name(stamp)] = datetime.fromisoformat(iso)
    # Drop the generic "snippet" key produced for the result wrapper itself.
    data.pop("snippet", None)
# Index the per-result dicts by package name and pretty-print the lot.
from pprint import pp

datas = {}
for info in infos:
    datas[info["name"]] = info
pp(datas)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.