Last active
June 21, 2023 19:32
-
-
Save it-is-wednesday/c35a4fc566a27f2c178dbe4207608c9e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Requires: requests, beautifulsoup4 | |
TODO: | |
- Logging | |
- Document :) | |
""" | |
import pickle | |
from itertools import count, islice | |
from pathlib import Path | |
from string import Template | |
from typing import Iterable, List, Optional, TypedDict | |
from bs4 import BeautifulSoup | |
from requests import Request, Session | |
# Pretend to be a desktop Firefox; sent on every request (see make_request).
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0"
)
# Hebrew for "leather" — shoes whose title contains it are filtered out
# in shoes_in_page.
LEATHER = "עור"
# Hebrew text the site shows on a missing page; used as a soft-404 marker
# in fetch_html_pages to stop paging through a category.
PAGE_NOT_FOUND = "לא ניתן למצוא את הדף הזה"
BASE_URL = "https://grass-shoes.co.il/c/women"
# Category slugs appended to BASE_URL, one listing per category.
CATEGORIES = [
    "women-footwear/mocassin",
    "women-footwear/women-sneakers",
    "women-boots/women-low-heel-boots",
    "women-booties/women-low-heel-booties",
    "women-footwear/women-low-heel-flat-shoes",
    "women-sandals/women-low-heel-flat-sandlas",
    "women-slippers/women-flat-low-heel-slippers",
]
# On-disk cache of fetched listing pages (see fetch_html_pages).
CACHE_DIR = Path("cache")
# HTML fragment for one shoe card; placeholders match the Shoe keys.
SHOE_TEMPLATE = Template(
    """
<div class="shoe">
<a href="$url">
<img src="$img_url"/>
</a>
<div>$title</div>
<div>$price</div>
</div>
"""
)
# Output page skeleton; $shoes is the concatenation of SHOE_TEMPLATE renders.
HTML_OUTPUT_TEMPLATE = Template(
    """
<html>
<head>
<style>
body {
display: flex;
flex-flow: wrap;
}
.shoe {
align-items: center;
display: flex;
flex-direction: column;
height: 350px;
overflow: hidden;
}
.shoe a {
margin-top: -200px;
}
</style>
</head>
<body>$shoes</body>
</html>
"""
)
class Shoe(TypedDict):
    """One product scraped from a listing page (built by parse_shoe)."""

    title: str  # product name, whitespace-stripped
    img_url: str  # lazy-load image URL (taken from the img data-src attribute)
    url: str  # product detail-page URL
    price: int  # price in shekels, truncated to a whole number
    sizes: List[str]  # available sizes as they appear in data-value attributes
def parse_shoe(shoe) -> Optional["Shoe"]:
    """Extract one Shoe record from a single product element.

    Parameters:
        shoe: a BeautifulSoup element for one product card
            (class ``content-product``).

    Returns:
        A ``Shoe`` dict, or ``None`` when the card is missing any expected
        sub-element or attribute (e.g. no size widget, no lazy-loaded
        image).  The original body could never actually return None even
        though the signature promised it — a malformed card crashed with
        AttributeError/KeyError instead; callers already skip None.
    """
    try:
        sizes = shoe.find(attrs={"data-attribute": "pa_size"}).find_all(
            class_="st-custom-attribute"
        )
        # Price text looks like "249.00₪": drop the shekel sign, then
        # truncate to whole shekels.
        price_text = shoe.find(class_="woocommerce-Price-amount").text.replace("₪", "")
        price = int(float(price_text))
        title = shoe.find("h2")  # contains both name and link
        return dict(
            title=title.text.strip(),
            img_url=shoe.find("img").attrs["data-src"],
            url=title.find("a").attrs["href"],
            sizes=[size.attrs["data-value"] for size in sizes],
            price=price,
        )
    except (AttributeError, KeyError, ValueError):
        # Unexpected markup: honour the Optional contract instead of
        # crashing the whole scrape on one bad card.
        return None
def shoes_in_page(page: BeautifulSoup, size: str = "40") -> Iterable[Shoe]:
    """Yield the shoes on one listing page that pass the filters.

    A shoe is kept when it parsed successfully, is available in *size*,
    and its title does not contain LEATHER (Hebrew for "leather").

    Parameters:
        page: a parsed category listing page.
        size: required shoe size, as it appears in the site's
            ``data-value`` attributes.  Previously hard-coded to "40";
            now a parameter with the same default, so existing callers
            are unaffected.
    """
    for shoe_element in page.find_all(class_="content-product"):
        shoe = parse_shoe(shoe_element)
        if shoe and size in shoe["sizes"] and LEATHER not in shoe["title"]:
            yield shoe
def make_request(url: str, page: int) -> Request:
    """Build a GET request for one page of a category listing.

    Sends a desktop-browser User-Agent (USER_AGENT) with every request.

    Parameters:
        url: category URL, with or without a trailing slash.
        page: 1-based page number appended as ``/page/<n>``.
    """
    return Request(
        method="GET",
        # rstrip, not strip: strip("/") would also eat *leading* slashes,
        # mangling any path-only or protocol-relative URL passed in.
        url=f"{url.rstrip('/')}/page/{page}",
        headers={"User-Agent": USER_AGENT},
    )
def fetch_html_pages(categories: List[str]) -> Iterable[str]:
    """Yield raw listing-page HTML for every category, page by page.

    Each page is cached on disk under CACHE_DIR and served from there on
    later runs without touching the network.  Paging through a category
    stops as soon as the site answers with its "page not found" text;
    such pages are never cached.
    """
    session = Session()
    for category in categories:
        page_number = 0
        while True:
            page_number += 1
            print(f"{category} page {page_number}")
            cache_file = CACHE_DIR / category / f"page_{page_number}.html"
            if cache_file.exists():
                yield cache_file.read_text()
                continue
            prepared = make_request(f"{BASE_URL}/{category}", page_number).prepare()
            body = session.send(prepared, timeout=10).text
            if PAGE_NOT_FOUND in body:
                print("Encountered a 404! Onto the next category... <3")
                break
            cache_file.parent.mkdir(exist_ok=True, parents=True)
            cache_file.write_text(body)
            yield body
def fetch_shoes(categories: List[str]) -> Iterable[Shoe]:
    """Fetch every listing page for *categories* and yield matching shoes."""
    for raw_html in fetch_html_pages(categories):
        page = BeautifulSoup(raw_html, "html.parser")
        yield from shoes_in_page(page)
# pylint: disable=unused-variable
def _dev_cache_responses(num_of_pages=4):
    """Dev helper: pickle the first *num_of_pages* raw pages to disk.

    Writes to ``cached_responses`` in the working directory; read it back
    with _dev_load_cached to iterate on parsing without re-fetching.
    """
    with open("cached_responses", "wb") as file:
        first_pages = islice(fetch_html_pages(CATEGORIES), num_of_pages)
        pickle.dump(list(first_pages), file)
# pylint: disable=unused-variable
def _dev_load_cached():
    """Dev helper: yield shoes parsed from the pickled responses.

    Counterpart of _dev_cache_responses.  Only unpickle this file if it
    was produced locally — never load untrusted pickles.
    """
    with open("cached_responses", "rb") as file:
        raw_pages = pickle.load(file)
    for page_html in raw_pages:
        # Name the parser explicitly, matching fetch_shoes: relying on
        # bs4's auto-detection emits GuessedAtParserWarning and may pick
        # a different parser (and markup tree) on another machine.
        yield from shoes_in_page(BeautifulSoup(page_html, "html.parser"))
def main():
    """Entry point: scrape every category and write the results to hi.html."""
    CACHE_DIR.mkdir(exist_ok=True)
    # and for dev purposes, eval `shoes = list(_dev_load_cached())` instead
    cards = [SHOE_TEMPLATE.substitute(shoe) for shoe in fetch_shoes(CATEGORIES)]
    document = HTML_OUTPUT_TEMPLATE.substitute(shoes="".join(cards))
    with open("hi.html", "w", encoding="UTF-8") as out:
        out.write(document)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment