Skip to content

Instantly share code, notes, and snippets.

@s3rgeym
Created May 11, 2024 15:37
Show Gist options
  • Save s3rgeym/c482295cf3c835df0325b307c81594f7 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import argparse
import functools
import os
import re
import sys
from contextlib import suppress
import requests
from bs4 import BeautifulSoup
# Desktop-browser User-Agent sent with every request; some sites reject the
# default python-requests UA string.
DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
# Matches the URL text inside <loc>...</loc> elements of a sitemap XML.
LOC_RE = re.compile(r"(?<=<loc>)[^<>]+")
# Diagnostics go to stderr so stdout carries only the harvested links.
echo = functools.partial(print, file=sys.stderr)
parser = argparse.ArgumentParser()
parser.add_argument("-ua", "--user-agent", default=DEFAULT_USER_AGENT)
# NOTE: parse_args() runs at import time — this file is a script, not a library.
args = parser.parse_args()
# One shared session reuses TCP connections and headers across all fetches.
session = requests.Session()
session.headers.update({"User-Agent": args.user_agent})
def extract_page_links(response: requests.Response):
    """Return sitemap <loc> URLs that point at currency or exchange pages.

    The response body is expected to be sitemap XML; order of the URLs in
    the document is preserved.
    """
    matches = []
    for url in LOC_RE.findall(response.text):
        if re.search(r"/(currencies|exchanges)/", url):
            matches.append(url)
    return matches
def extract_external_links(response: requests.Response):
    """Collect href values of outbound anchors on an HTML page.

    Only anchors with rel="nofollow noopener", target="_blank" and an href
    attribute are considered — the markup CoinMarketCap uses for external
    links.
    """
    soup = BeautifulSoup(response.text, "lxml")
    anchors = soup.find_all(
        "a", rel="nofollow noopener", href=True, target="_blank"
    )
    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor["href"])
    return hrefs
def main():
    """Crawl CoinMarketCap currency/exchange pages and print external links.

    Fetches the main sitemap, visits every matching page, and writes any
    external links found to stdout (one per line); progress and errors go
    to stderr via ``echo``.
    """
    sitemap_response = session.get(
        "https://s3.coinmarketcap.com/generated/sitemaps/main/en.xml"
    )
    for link in extract_page_links(sitemap_response):
        echo(f"parse: {link=}")
        try:
            page = session.get(link)
            links = extract_external_links(page)
        except requests.RequestException as ex:
            # Best-effort crawl: log the failure and move on to the next page.
            echo(f"{ex=}")
            continue
        if links:
            echo(f"found: {links=}")
            print(*links, sep=os.linesep, flush=True)
if __name__ == "__main__":
    # Exit quietly on Ctrl-C instead of dumping a traceback.
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment