Skip to content

Instantly share code, notes, and snippets.

@bbelderbos
Created April 6, 2023 06:12
Show Gist options
Save bbelderbos/1b13763e1ba91cb5595cef8c1c9821c0 to your computer and use it in GitHub Desktop.
from pathlib import Path
import concurrent.futures
from fake_useragent import UserAgent
import requests
# API endpoint queried by get_links(); each row of its JSON response is read for a "link" key.
ARTICLE_ENDPOINT = "https://codechalleng.es/api/articles/"
# Local file that stores the article URLs, one per line, so the API is only queried when it is missing.
ARTICLE_LINKS = Path("links")
# Directory that receives one file per downloaded article.
DOWNLOADS_FOLDER = Path("downloads")
# Headers sent with every article download; the User-Agent is the Chrome string supplied by fake_useragent.
HEADERS = {"User-Agent": str(UserAgent().chrome)}
def get_links(timeout=30):
    """Fetch the article listing and return the link of every article.

    Args:
        timeout: seconds to wait for the server before giving up.

    Returns:
        A list holding the ``"link"`` value of each row in the JSON
        response served by ``ARTICLE_ENDPOINT``.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # requests waits forever unless a timeout is given, so bound the wait.
    response = requests.get(ARTICLE_ENDPOINT, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error body as the listing.
    response.raise_for_status()
    return [row["link"] for row in response.json()]
def _download_url(url, timeout=30):
    """Download one article and save its body under ``DOWNLOADS_FOLDER``.

    The file name is the last path segment of *url* (ignoring a trailing
    slash) with a trailing ``.html`` suffix removed.

    Args:
        url: address of the article to download.
        timeout: seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # requests waits forever unless a timeout is given; a stalled server
    # would otherwise block one of the worker threads indefinitely.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Do not save a 4xx/5xx error page as if it were the article.
    response.raise_for_status()
    filename = url.rstrip("/").split("/")[-1].removesuffix(".html")
    path = DOWNLOADS_FOLDER / filename
    # Pin the on-disk encoding: the locale default may be unable to encode
    # characters that appear in the article text.
    path.write_text(response.text, encoding="utf-8")
def download_articles(urls):
    """Download every URL in *urls* using a pool of five worker threads.

    Each failure is reported on stderr together with the URL that caused it.
    Once every download has finished, the first failure observed is re-raised
    so the caller still sees the error.

    Args:
        urls: iterable of article URLs; each one is passed to
            ``_download_url``.

    Raises:
        Exception: whatever the first failing ``_download_url`` call raised.
    """
    import sys  # local import: only needed for error reporting

    first_error = None
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Map each future back to its URL so a failure can be attributed.
        future_to_url = {executor.submit(_download_url, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            try:
                future.result()
            except Exception as exc:
                print(f"{future_to_url[future]!r} failed: {exc}", file=sys.stderr)
                # Keep the first failure so the raised exception is the one
                # that would have surfaced without the reporting.
                if first_error is None:
                    first_error = exc
    if first_error is not None:
        raise first_error
if __name__ == "__main__":
    # Query the API only when the links file is missing; later runs reuse it.
    if not ARTICLE_LINKS.exists():
        links = get_links()
        ARTICLE_LINKS.write_text("\n".join(links) + "\n", encoding="utf-8")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    DOWNLOADS_FOLDER.mkdir(exist_ok=True)

    # Drop blank lines: an empty listing (or a hand-edited file) would
    # otherwise yield "" as a URL, which requests rejects.
    lines = ARTICLE_LINKS.read_text(encoding="utf-8").splitlines()
    urls = [line.strip() for line in lines if line.strip()]
    download_articles(urls)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment