@e2thenegpii
Created August 20, 2021 04:38
combine packaging and aiohttp to discover valid distributions
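The script below scrapes a package index's PEP 503 "simple" pages for distribution links and keeps those whose version satisfies each requirement's specifier. As a minimal sketch of the version check it relies on (the filename and requirement are illustrative examples, assuming packaging >= 20.9 for parse_wheel_filename):

from packaging.requirements import Requirement
from packaging.utils import parse_wheel_filename

req = Requirement('pylint>=2.9 ; python_version >= "3.8"')
# PEP 427 wheel filenames encode name, version, optional build tag, and compatibility tags
name, version, build, tags = parse_wheel_filename("pylint-2.10.2-py3-none-any.whl")
print(name, version)             # pylint 2.10.2
print(version in req.specifier)  # True: 2.10.2 satisfies >=2.9

The full script follows.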
import asyncio
from contextlib import closing
from typing import Iterator, Dict, Set, AsyncIterator, Mapping, Container
from collections import defaultdict
from pprint import pprint

import aiohttp
from packaging.requirements import Requirement
from packaging.utils import parse_wheel_filename, parse_sdist_filename
from lxml import etree
from yarl import URL


async def get_anchor_tags(response: aiohttp.ClientResponse) -> AsyncIterator[etree.Element]:
    """Incrementally parse the response body and yield <a> elements as they complete."""
    with closing(etree.HTMLPullParser()) as parser:
        async for data in response.content.iter_any():
            parser.feed(data)
            for _, element in parser.read_events():
                if element.tag == 'a':
                    yield element


async def get_candidate_urls(response: aiohttp.ClientResponse) -> AsyncIterator[URL]:
    """Yield the URL of every link on the index page, resolving relative hrefs."""
    async for element in get_anchor_tags(response):
        href = element.attrib.get('href', None)
        if href:
            url = URL(href)
            if not url.is_absolute():
                # Resolve relative links against the page URL rather than replacing its whole path
                url = response.url.join(url)
            yield url


async def filter_by_requirement(requirement: Requirement, response: aiohttp.ClientResponse) -> AsyncIterator[URL]:
    """Yield only the candidate URLs whose version satisfies the requirement's specifier."""
    async for url in get_candidate_urls(response):
        if url.name.endswith('.whl'):
            name, version, build, tags = parse_wheel_filename(url.name)
        elif url.name.endswith('.tar.gz') or url.name.endswith('.zip'):
            name, version = parse_sdist_filename(url.name)
        else:
            # TODO log a warning
            continue
        if version in requirement.specifier:
            yield url


async def get_from_index(index: URL, requirements: Iterator[Requirement]) -> Mapping[Requirement, Container[URL]]:
    """Query the index's PEP 503 simple pages and collect matching distribution URLs per requirement."""
    reqs: Dict[Requirement, Set[URL]] = defaultdict(set)
    async with aiohttp.ClientSession() as session:
        for req in requirements:
            # Project names are assumed to already be PEP 503 normalized (lowercase, '-' separators)
            async with session.get(index.with_path(f"simple/{req.name}")) as response:
                reqs[req].update({url async for url in filter_by_requirement(req, response)})
    return reqs


async def main(index: URL, requirements: Iterator[Requirement]) -> None:
    # Drop requirements whose environment markers do not apply to this interpreter
    reqs = await get_from_index(index, (x for x in requirements if x.marker is None or x.marker.evaluate()))
    for req, candidate_urls in reqs.items():
        # Report how many candidate distributions matched each requirement
        pprint({str(req): len(candidate_urls)})


reqs = [
    'setuptools',
    'flake8',
    'pylint>=2.9 ; python_version >= "3.8"',
    'requests',
    'mypy',
]

if __name__ == "__main__":
    asyncio.run(main(URL("https://pypi.python.org/"), (Requirement(x) for x in reqs)))
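
A hypothetical adaptation, not part of the original gist: feeding the same pipeline from a requirements.txt file of plain PEP 508 lines instead of the hard-coded list. The file name, the comment filtering, and load_requirements are assumptions; the snippet reuses main from the script above.

import asyncio
from pathlib import Path
from typing import Iterator

from packaging.requirements import Requirement
from yarl import URL


def load_requirements(path: str = "requirements.txt") -> Iterator[Requirement]:
    # Skip blank lines and comments; parse everything else as a PEP 508 requirement
    for line in Path(path).read_text().splitlines():
        line = line.strip()
        if line and not line.startswith("#"):
            yield Requirement(line)


if __name__ == "__main__":
    asyncio.run(main(URL("https://pypi.python.org/"), load_requirements()))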