Created
September 23, 2023 10:57
-
-
Save shravanasati/c14f40105815a0f7bdd0ba553fde7b83 to your computer and use it in GitHub Desktop.
Simple Python scraper to download images from windows10spotlight.com.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
from pathlib import Path
from urllib.parse import urlsplit

import httpx
from bs4 import BeautifulSoup
# Base URL of the site being scraped.
DOMAIN = "https://windows10spotlight.com"
# Listing pages are requested as f"{PAGE_ENDPOINT}/{page}".
PAGE_ENDPOINT = DOMAIN + "/page"
# Highest listing-page number to crawl (pages 1..NPAGES inclusive).
# NOTE(review): hard-coded; presumably the site's page count when this was
# written -- confirm before relying on it.
NPAGES = 1011
# Worker-thread count for both the page crawl and the image downloads.
NTHREADS = 50

# Download target: an "images" directory under the current working directory,
# created at import time if it does not exist yet.
images = Path.cwd() / "images"
images.mkdir(exist_ok=True)
def get_image_links_from_page(page: int) -> list[str]:
    """Return the image URLs found on one listing page.

    Fetches ``{PAGE_ENDPOINT}/{page}``, keeps every ``<img>`` whose ``src``
    contains ``windows10spotlight``, and removes each ``-<digits>x<digits>``
    segment from the URL (presumably the thumbnail-size suffix, so the
    result points at the full-size file).

    Args:
        page: Page number appended to ``PAGE_ENDPOINT``.

    Returns:
        The de-duplicated URLs in first-seen order. An empty list is
        returned when the request fails or the response is not HTTP 200.
    """
    print("Fetching page", page)
    try:
        resp = httpx.get(f"{PAGE_ENDPOINT}/{page}", follow_redirects=True)
    except httpx.HTTPError as exc:
        # One unreachable page must not abort the whole crawl.
        print("Failed to load page:", exc)
        return []
    if resp.status_code != 200:
        print("Failed to load page:", resp.status_code)
        return []

    soup = BeautifulSoup(resp.content, "html5lib")
    # A dict is used as an ordered set: several size variants of one image
    # collapse to the same URL once the size suffix is removed.
    links: dict[str, None] = {}
    for image in soup.find_all("img"):
        src = image.get("src")
        # Skip <img> tags without a src and images hosted elsewhere.
        if not src or "windows10spotlight" not in src:
            continue
        links[re.sub(r"-\d+x\d+", "", src)] = None
    return list(links)
def download_image(link: str) -> None:
    """Download ``link`` into the ``images`` directory.

    The file is named after the last segment of the URL path. A link with
    no file name, a network error, or a non-200 response is reported on
    stdout and skipped; nothing is written in those cases.

    Args:
        link: URL of the image to download.
    """
    # Take the name from the URL *path* so that a query string or fragment
    # never becomes part of the file name.
    filename = Path(urlsplit(link).path).name
    if not filename:
        print("Skipping link without a file name:", link)
        return

    try:
        # Follow redirects, matching how the listing pages are fetched.
        resp = httpx.get(link, follow_redirects=True)
    except httpx.HTTPError as exc:
        print("Failed to download image:", link, exc)
        return
    if resp.status_code != 200:
        # Do not save an error page under an image file name.
        print("Failed to download image:", link, resp.status_code)
        return

    (images / filename).write_bytes(resp.content)
if __name__ == "__main__":
    # Phase 1: collect image links from every listing page concurrently.
    links: list[str] = []
    with ThreadPoolExecutor(max_workers=NTHREADS) as pool:
        futures: list[Future] = [
            pool.submit(get_image_links_from_page, page)
            for page in range(1, NPAGES + 1)
        ]
        for future in as_completed(futures):
            try:
                links.extend(future.result())
            except Exception as exc:  # top-level boundary: report, carry on
                print("Failed to collect links from a page:", exc)

    # The same URL may be collected from more than one page; download each
    # one once so two workers never write the same file at the same time.
    unique_links = list(dict.fromkeys(links))

    # Phase 2: download. The futures are consumed explicitly because an
    # unconsumed pool.map() silently discards exceptions raised in workers.
    with ThreadPoolExecutor(max_workers=NTHREADS) as pool:
        downloads = {
            pool.submit(download_image, link): link for link in unique_links
        }
        for future in as_completed(downloads):
            try:
                future.result()
            except Exception as exc:  # top-level boundary: report, carry on
                print("Failed to download image:", downloads[future], exc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment