Created
May 11, 2024 15:37
-
-
Save s3rgeym/c482295cf3c835df0325b307c81594f7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import argparse
import functools
import os
import re
import sys
from contextlib import suppress
import requests
from bs4 import BeautifulSoup
# Desktop-browser User-Agent sent with every request so the target site
# serves its normal HTML instead of a bot/challenge page.
DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"

# Extracts the URL text of sitemap <loc>...</loc> entries.
LOC_RE = re.compile(r"(?<=<loc>)[^<>]+")

# Diagnostics go to stderr so stdout carries only the harvested links.
echo = functools.partial(print, file=sys.stderr)

parser = argparse.ArgumentParser()
parser.add_argument("-ua", "--user-agent", default=DEFAULT_USER_AGENT)
args = parser.parse_args()

# One shared session: connection reuse plus a single place for headers.
session = requests.Session()
session.headers["User-Agent"] = args.user_agent
def extract_page_links(response: requests.Response) -> list[str]:
    """Filter sitemap URLs down to currency/exchange detail pages.

    Scans the raw XML body for <loc> entries (module-level LOC_RE) and
    keeps only links whose path contains /currencies/ or /exchanges/.
    """
    wanted = re.compile(r"/(currencies|exchanges)/")
    pages = []
    for url in LOC_RE.findall(response.text):
        if wanted.search(url):
            pages.append(url)
    return pages
def extract_external_links(response: requests.Response) -> list[str]:
    """Collect outbound link hrefs from a fetched HTML page.

    Targets anchors marked rel="nofollow noopener" with target="_blank" —
    presumably how the site annotates off-site links (TODO: confirm the
    markup convention still holds on live pages).
    """
    soup = BeautifulSoup(response.text, "lxml")
    anchors = soup.find_all("a", rel="nofollow noopener", href=True, target="_blank")
    return [anchor["href"] for anchor in anchors]
def main():
    """Crawl coinmarketcap currency/exchange pages and print external links.

    Fetches the sitemap, visits every currency/exchange page it lists, and
    writes each page's outbound links to stdout; progress and per-page
    errors go to stderr via ``echo``. Returns None (exit status 0).
    """
    # requests has NO default timeout — without one, a single stalled
    # server would hang the crawl forever.
    timeout = 30.0
    r = session.get(
        "https://s3.coinmarketcap.com/generated/sitemaps/main/en.xml",
        timeout=timeout,
    )
    # Fail loudly if the sitemap itself is unavailable instead of
    # silently iterating an empty link list.
    r.raise_for_status()
    page_links = extract_page_links(r)
    for link in page_links:
        echo(f"parse: {link=}")
        try:
            r = session.get(link, timeout=timeout)
            if links := extract_external_links(r):
                echo(f"found: {links=}")
                print(*links, sep=os.linesep, flush=True)
        except requests.RequestException as ex:
            # Best-effort crawl: log the failure and move to the next page.
            echo(f"{ex=}")


if __name__ == "__main__":
    # Ctrl-C ends the crawl quietly instead of dumping a traceback.
    with suppress(KeyboardInterrupt):
        sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment