Skip to content

Instantly share code, notes, and snippets.

@shravanasati
Created September 23, 2023 10:57
Show Gist options
  • Save shravanasati/c14f40105815a0f7bdd0ba553fde7b83 to your computer and use it in GitHub Desktop.
Save shravanasati/c14f40105815a0f7bdd0ba553fde7b83 to your computer and use it in GitHub Desktop.
Simple Python scraper to download images from windows10spotlight.com
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
from pathlib import Path
import re
import httpx
from bs4 import BeautifulSoup
# Root URL of the site being scraped and the paginated listing beneath it.
DOMAIN = "https://windows10spotlight.com"
PAGE_ENDPOINT = f"{DOMAIN}/page"

# Upper bound of the page range and the worker count for each thread pool.
NPAGES = 1011
NTHREADS = 50

# Downloads land in ./images (relative to the current working directory);
# the directory is created if it does not exist yet.
images = Path.cwd().joinpath("images")
images.mkdir(exist_ok=True)
def get_image_links_from_page(page: int) -> list[str]:
    """Collect image URLs from one listing page.

    Requests ``{PAGE_ENDPOINT}/{page}`` (following redirects), inspects every
    ``<img>`` tag, keeps those whose ``src`` contains ``windows10spotlight``
    and removes every ``-<digits>x<digits>`` fragment from the URL
    (apparently a thumbnail-size suffix -- confirm against the site markup).

    Args:
        page: Page number appended to ``PAGE_ENDPOINT``.

    Returns:
        The normalised links in document order, without duplicates. An empty
        list when the request fails or the response status is not 200.
    """
    print("Fetching page", page)
    try:
        resp = httpx.get(f"{PAGE_ENDPOINT}/{page}", follow_redirects=True)
    except httpx.HTTPError as exc:
        # One unreachable page must not abort the whole crawl.
        print("Failed to load page:", exc)
        return []
    if resp.status_code != 200:
        print("Failed to load page:", resp.status_code)
        return []

    soup = BeautifulSoup(resp.content, "html5lib")
    # A dict is used as an ordered set: differently sized thumbnails of the
    # same picture collapse to one link once the size fragment is removed.
    links: dict[str, None] = {}
    for image in soup.find_all("img"):
        # Not every <img> carries a src attribute; .get avoids a KeyError.
        src = image.get("src")
        if not src or "windows10spotlight" not in src:
            continue
        links[re.sub(r"-\d+x\d+", "", src)] = None
    return list(links)
def download_image(link: str):
    """Download one image into the ``images`` directory.

    The file is named after the last ``/``-separated segment of ``link``.
    Nothing is written when that segment is empty, when the request fails, or
    when the server answers with a non-200 status, so error pages are never
    saved as image files.

    Args:
        link: Absolute URL of the image to fetch.
    """
    filename = link.split("/")[-1]
    if not filename:
        # A link ending in "/" has no file name to save under.
        print("Skipping link without a file name:", link)
        return

    try:
        # Follow redirects, matching how the listing pages are fetched.
        resp = httpx.get(link, follow_redirects=True)
    except httpx.HTTPError as exc:
        print("Failed to download", link, "-", exc)
        return
    if resp.status_code != 200:
        print("Failed to download", link, "-", resp.status_code)
        return

    (images / filename).write_bytes(resp.content)
if __name__ == "__main__":
    # Phase 1: gather links from every page. A set is used so that a link
    # found on several pages is downloaded only once (two workers writing the
    # same file concurrently could otherwise clobber each other).
    links: set[str] = set()
    with ThreadPoolExecutor(max_workers=NTHREADS) as pool:
        page_futures: dict[Future, int] = {
            pool.submit(get_image_links_from_page, page): page
            for page in range(1, NPAGES + 1)
        }
        for future in as_completed(page_futures):
            error = future.exception()
            if error is not None:
                # Report and move on; one bad page should not end the run.
                print("Failed to process page", page_futures[future], "-", error)
                continue
            links.update(future.result())

    # Phase 2: download every collected link. Futures are inspected
    # explicitly because exceptions raised inside pool.map are silently lost
    # when its results are never iterated.
    with ThreadPoolExecutor(max_workers=NTHREADS) as pool:
        download_futures: dict[Future, str] = {
            pool.submit(download_image, link): link for link in links
        }
        for future in as_completed(download_futures):
            error = future.exception()
            if error is not None:
                print("Failed to download", download_futures[future], "-", error)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment