shravanasati/spotlight.py

## spotlight.py
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
from pathlib import Path
import re
import httpx
from bs4 import BeautifulSoup

# Root URL of the site being scraped, and the base path of its paginated listing.
DOMAIN = "https://windows10spotlight.com"
PAGE_ENDPOINT = DOMAIN + "/page"

# Worker-thread count for the executors, and the number of listing pages crawled.
NTHREADS = 50
NPAGES = 1011

# Output directory under the current working directory; created on import
# if it does not exist yet.
images = Path.cwd().joinpath("images")
images.mkdir(parents=False, exist_ok=True)


def get_image_links_from_page(page: int) -> list[str]:
    """Collect image URLs from one listing page of the site.

    Fetches ``{PAGE_ENDPOINT}/{page}``, scans every ``<img>`` tag, keeps the
    sources containing "windows10spotlight", and strips every
    ``-<digits>x<digits>`` fragment from each URL (presumably the
    thumbnail-size suffix, so the full-size image is addressed).

    Args:
        page: Number of the listing page to fetch.

    Returns:
        The rewritten image URLs in document order (duplicates are kept).
        An empty list when the request fails or the status is not 200.
    """
    print("Fetching page", page)
    try:
        resp = httpx.get(f"{PAGE_ENDPOINT}/{page}", follow_redirects=True)
    except httpx.HTTPError as exc:
        # One timeout or connection error must not abort the whole crawl;
        # report it the same way as a bad status code.
        print("Failed to load page:", exc)
        return []
    if resp.status_code != 200:
        print("Failed to load page:", resp.status_code)
        # Same type as the success path, so callers always get a list.
        return []

    soup = BeautifulSoup(resp.content, "html5lib")
    links = []
    for image in soup.find_all("img"):
        # Not every <img> carries a src attribute; indexing would raise
        # KeyError and lose every link on the page.
        src = image.get("src")
        if not src or "windows10spotlight" not in src:
            continue
        links.append(re.sub(r"-\d+x\d+", "", src))

    return links


def download_image(link: str) -> None:
    """Download one image into the ``images`` directory.

    The file is named after the last ``/``-separated segment of *link*.
    Failures are reported on stdout and nothing is written, so an error
    page is never saved under an image file name.

    Args:
        link: URL of the image to download.
    """
    filename = link.split("/")[-1]
    if not filename:
        # A link ending in "/" has no file name; opening the bare
        # directory path for writing would fail.
        print("Skipping link without a file name:", link)
        return

    try:
        # Follow redirects, consistent with how listing pages are fetched.
        resp = httpx.get(link, follow_redirects=True)
    except httpx.HTTPError as exc:
        print("Failed to download image:", exc)
        return
    if resp.status_code != 200:
        print("Failed to download image:", resp.status_code)
        return

    (images / filename).write_bytes(resp.content)


if __name__ == "__main__":
    # Phase 1: crawl every listing page concurrently and gather image URLs.
    links: list[str] = []
    with ThreadPoolExecutor(max_workers=NTHREADS) as pool:
        page_futures = {
            pool.submit(get_image_links_from_page, page): page
            for page in range(1, NPAGES + 1)
        }
        for future in as_completed(page_futures):
            error = future.exception()
            if error is not None:
                # Keep going: one broken page should not discard the links
                # already collected from the others.
                print("Failed to process page", page_futures[future], "-", error)
                continue
            links.extend(future.result())

    # The same URL can be collected more than once; downloading it twice
    # wastes bandwidth and lets two threads write the same file at once.
    unique_links = sorted(set(links))

    # Phase 2: download concurrently. Each future is inspected so that a
    # failure is reported instead of being silently dropped, which is what
    # happens when the results of pool.map() are never consumed.
    with ThreadPoolExecutor(max_workers=NTHREADS) as pool:
        download_futures = {
            pool.submit(download_image, link): link for link in unique_links
        }
        for future in as_completed(download_futures):
            error = future.exception()
            if error is not None:
                print("Failed to download", download_futures[future], "-", error)
	from concurrent.futures import Future, ThreadPoolExecutor, as_completed
	from pathlib import Path
	import re
	import httpx
	from bs4 import BeautifulSoup

	# Root URL of the site being scraped, and the base path of its paginated listing.
	DOMAIN = "https://windows10spotlight.com"
	PAGE_ENDPOINT = DOMAIN + "/page"

	# Worker-thread count for the executors, and the number of listing pages crawled.
	NTHREADS = 50
	NPAGES = 1011

	# Output directory under the current working directory; created
	# if it does not exist yet.
	images = Path.cwd().joinpath("images")
	images.mkdir(parents=False, exist_ok=True)


	def get_image_links_from_page(page: int) -> list[str]:
	    """Collect image URLs from one listing page of the site.

	    Fetches ``{PAGE_ENDPOINT}/{page}``, scans every ``<img>`` tag, keeps
	    the sources containing "windows10spotlight", and strips every
	    ``-<digits>x<digits>`` fragment from each URL (presumably the
	    thumbnail-size suffix, so the full-size image is addressed).

	    Args:
	        page: Number of the listing page to fetch.

	    Returns:
	        The rewritten image URLs in document order (duplicates are kept).
	        An empty list when the request fails or the status is not 200.
	    """
	    print("Fetching page", page)
	    try:
	        resp = httpx.get(f"{PAGE_ENDPOINT}/{page}", follow_redirects=True)
	    except httpx.HTTPError as exc:
	        # One timeout or connection error must not abort the whole crawl;
	        # report it the same way as a bad status code.
	        print("Failed to load page:", exc)
	        return []
	    if resp.status_code != 200:
	        print("Failed to load page:", resp.status_code)
	        # Same type as the success path, so callers always get a list.
	        return []

	    soup = BeautifulSoup(resp.content, "html5lib")
	    links = []
	    for image in soup.find_all("img"):
	        # Not every <img> carries a src attribute; indexing would raise
	        # KeyError and lose every link on the page.
	        src = image.get("src")
	        if not src or "windows10spotlight" not in src:
	            continue
	        links.append(re.sub(r"-\d+x\d+", "", src))

	    return links


	def download_image(link: str) -> None:
	    """Download one image into the ``images`` directory.

	    The file is named after the last ``/``-separated segment of *link*.
	    Failures are reported on stdout and nothing is written, so an error
	    page is never saved under an image file name.

	    Args:
	        link: URL of the image to download.
	    """
	    filename = link.split("/")[-1]
	    if not filename:
	        # A link ending in "/" has no file name; opening the bare
	        # directory path for writing would fail.
	        print("Skipping link without a file name:", link)
	        return

	    try:
	        # Follow redirects, consistent with how listing pages are fetched.
	        resp = httpx.get(link, follow_redirects=True)
	    except httpx.HTTPError as exc:
	        print("Failed to download image:", exc)
	        return
	    if resp.status_code != 200:
	        print("Failed to download image:", resp.status_code)
	        return

	    (images / filename).write_bytes(resp.content)


	if __name__ == "__main__":
	    # Phase 1: crawl every listing page concurrently and gather image URLs.
	    links: list[str] = []
	    with ThreadPoolExecutor(max_workers=NTHREADS) as pool:
	        page_futures = {
	            pool.submit(get_image_links_from_page, page): page
	            for page in range(1, NPAGES + 1)
	        }
	        for future in as_completed(page_futures):
	            error = future.exception()
	            if error is not None:
	                # Keep going: one broken page should not discard the
	                # links already collected from the others.
	                print("Failed to process page", page_futures[future], "-", error)
	                continue
	            links.extend(future.result())

	    # The same URL can be collected more than once; downloading it twice
	    # wastes bandwidth and lets two threads write the same file at once.
	    unique_links = sorted(set(links))

	    # Phase 2: download concurrently. Each future is inspected so that a
	    # failure is reported instead of being silently dropped, which is
	    # what happens when the results of pool.map() are never consumed.
	    with ThreadPoolExecutor(max_workers=NTHREADS) as pool:
	        download_futures = {
	            pool.submit(download_image, link): link for link in unique_links
	        }
	        for future in as_completed(download_futures):
	            error = future.exception()
	            if error is not None:
	                print("Failed to download", download_futures[future], "-", error)