Skip to content

Instantly share code, notes, and snippets.

@greyblue9
Created May 3, 2022 08:26
Show Gist options
  • Save greyblue9/38e2f171c0247eff5a60a6480be932a1 to your computer and use it in GitHub Desktop.
Save greyblue9/38e2f171c0247eff5a60a6480be932a1 to your computer and use it in GitHub Desktop.
pypi Search
#!/usr/bin/env python3
from datetime import datetime
from sys import argv
from bs4 import BeautifulSoup, GuessedAtParserWarning, Tag
from warnings import simplefilter
from urllib.request import Request, urlopen
from urllib.parse import quote_plus, urljoin
# Silence bs4's "no parser was explicitly specified" warning. The parser is
# named explicitly below, so this filter only acts as a safety net.
simplefilter(category=GuessedAtParserWarning, action="ignore")

# Search terms come from the command line; "gif" is the fallback query.
query = " ".join(argv[1:]) if argv[1:] else "gif"
url = f"https://pypi.org/search/?q={quote_plus(query)}"

# Fetch the search page. The timeout keeps the script from blocking forever
# on a stalled connection, and naming the parser makes the resulting tree
# independent of which optional parsers happen to be installed.
with urlopen(Request(method="GET", url=url), timeout=30) as f:
    doc = BeautifulSoup(f.read(), "html.parser")
def find_name(cur: Tag) -> str:
    """Return the short field name of the nearest ``package-*`` element.

    Starting at *cur* and walking up through ``.parent``, find the first node
    that has a CSS class beginning with ``"package-"``.  The result is that
    class with the ``"package-"`` prefix removed and, if present, a following
    ``"snippet__"`` prefix removed as well (``"package-snippet__name"``
    becomes ``"name"``, ``"package-snippet"`` becomes ``"snippet"``).

    Args:
        cur: The node to start from.  Nodes without an ``attrs`` mapping are
            treated as having no classes and are skipped.

    Returns:
        The short name, or ``""`` when neither *cur* nor any ancestor has a
        ``package-*`` class.
    """
    prefix = "package-"
    node = cur
    # Compare against None explicitly so the walk does not depend on how the
    # node type defines truthiness.
    while node is not None:
        classes = (getattr(node, "attrs", None) or {}).get("class") or ()
        if isinstance(classes, str):
            # Some parser configurations keep ``class`` as one string;
            # split it so it is not iterated character by character.
            classes = classes.split()
        for cls in classes:
            if cls.startswith(prefix):
                return cls.removeprefix(prefix).removeprefix("snippet__")
        node = node.parent
    return ""
def _first_text(node: Tag) -> str:
    """Return the stripped text of *node*'s first direct child that has any.

    Returns ``""`` when no direct child carries non-blank text, so an empty
    element (for example a blank description) yields an empty field instead
    of raising.
    """
    for child in node:
        text = (child.string or child.text).strip()
        if text:
            return text
    return ""


# Every element whose class attribute starts with "package-"; the first one
# is used to find the container that holds all results.
matches = doc.select('[class^="package-"]')

elems = []
if matches:
    el = matches[0]
    # Climb while the element is reported as an only child of its parent, so
    # that ``el`` ends up being one entry among several siblings.  Stop below
    # the document root, whose own parent is None.
    while (
        el.parent.parent is not None
        and el in el.parent.select(":first-child:last-child")
    ):
        el = el.parent
    # The tag siblings of that entry (itself included) are the result items.
    elems = [c for c in el.parent.children if isinstance(c, Tag)]

# One dict per result item: short field name -> first piece of text found
# directly inside the corresponding ``package-*`` element.  When two elements
# map to the same field name, the later one wins.
infos = [
    {find_name(e): _first_text(e) for e in elem.select('[class^="package-"]')}
    for elem in elems
]
# Enrich each result dict with absolute link URLs and parsed timestamps.
# ``elems`` and ``infos`` are parallel lists, so they are walked in lockstep.
for elem, data in zip(elems, infos):
    # The item itself may be the anchor; otherwise collect anchors inside it.
    links = (
        ([elem] if elem.name.lower() == "a" else [])
        + list(elem.select("a[href]"))
    )
    for link in links:
        rel_href = link.attrs.get("href")
        abs_href = urljoin(url, rel_href)
        # The first link becomes "url"; any further link is stored under the
        # field name of its enclosing ``package-*`` element.
        k = "url" if "url" not in data else find_name(link)
        data[k] = abs_href

    for t in elem.select("time[datetime]"):
        # Normalise "Z" / "+0000" to "+00:00", the UTC offset spelling that
        # datetime.fromisoformat accepts on every Python version.
        datetime_s = t["datetime"].replace("+0000", "+00:00").replace("Z", "+00:00")
        data[find_name(t)] = datetime.fromisoformat(datetime_s)

    # The text of the whole-snippet wrapper duplicates the individual fields.
    data.pop("snippet", None)

# Index the results by package name.  Entries without a "name" field (for
# example container children that are not search results) are left out
# rather than aborting the script with a KeyError.
datas = {i["name"]: i for i in infos if "name" in i}

# Imported here, right before its only use.
from pprint import pp
pp(datas)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment