# Source: GitHub gist it-is-wednesday/c35a4fc566a27f2c178dbe4207608c9e
# Author: @it-is-wednesday (last active June 21, 2023 19:32)
# (GitHub page navigation text removed; it is not part of the script.)
"""
Requires: requests, beautifulsoup4
TODO:
- Logging
- Document :)
"""
import pickle
from itertools import count, islice
from pathlib import Path
from string import Template
from typing import Iterable, List, Optional, TypedDict
from bs4 import BeautifulSoup
from requests import Request, Session
# Browser-like User-Agent header sent with every request built by make_request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0"
)
# Hebrew for "leather"; shoes whose title contains this word are filtered out.
LEATHER = "עור"
# Hebrew "this page cannot be found" text; its presence in a response body is
# how the end of a category's pagination is detected.
PAGE_NOT_FOUND = "לא ניתן למצוא את הדף הזה"
# Root URL that every entry of CATEGORIES is appended to.
BASE_URL = "https://grass-shoes.co.il/c/women"
# Category paths (relative to BASE_URL) that get paged through.
# NOTE(review): "sandlas" below is presumably the site's own slug spelling —
# verify against the live URL before "correcting" it.
CATEGORIES = [
    "women-footwear/mocassin",
    "women-footwear/women-sneakers",
    "women-boots/women-low-heel-boots",
    "women-booties/women-low-heel-booties",
    "women-footwear/women-low-heel-flat-shoes",
    "women-sandals/women-low-heel-flat-sandlas",
    "women-slippers/women-flat-low-heel-slippers",
]
# Directory (relative to the working directory) holding the cached HTML pages,
# laid out as <CACHE_DIR>/<category>/page_<n>.html.
CACHE_DIR = Path("cache")
# HTML fragment for a single shoe: a linked image followed by title and price.
# The $url, $img_url, $title and $price placeholders are filled through
# Template.substitute with a Shoe mapping.
SHOE_TEMPLATE = Template(
    """
<div class="shoe">
<a href="$url">
<img src="$img_url"/>
</a>
<div>$title</div>
<div>$price</div>
</div>
"""
)
# Skeleton of the generated page. $shoes receives the concatenated per-shoe
# fragments. The page is written to disk as UTF-8 and carries Hebrew titles,
# so the charset is declared explicitly instead of leaving the browser to guess.
HTML_OUTPUT_TEMPLATE = Template(
    """
<html>
  <head>
    <meta charset="utf-8">
    <style>
      body {
        display: flex;
        flex-flow: wrap;
      }
      .shoe {
        align-items: center;
        display: flex;
        flex-direction: column;
        height: 350px;
        overflow: hidden;
      }
      .shoe a {
        margin-top: -200px;
      }
    </style>
  </head>
  <body>$shoes</body>
</html>
"""
)
class Shoe(TypedDict):
    """Fields scraped from one product element of a listing page."""

    # Stripped text of the product's <h2> heading.
    title: str
    # Image address taken from the <img> element's "data-src" attribute.
    img_url: str
    # href of the link inside the product heading.
    url: str
    # Price with the "₪" sign removed and any fractional part dropped.
    price: int
    # "data-value" of each option found under the product's size attribute.
    sizes: List[str]
def parse_shoe(shoe) -> Optional[Shoe]:
    """Extract the fields of one product element into a ``Shoe``.

    Args:
        shoe: A parsed product element (one ``content-product`` node) that
            exposes the BeautifulSoup ``find`` API.

    Returns:
        The parsed ``Shoe``, or ``None`` when the element lacks the size
        picker, price, heading link or image address, so that one odd product
        does not abort the whole run.

    Raises:
        ValueError: If the price text is not a number once "₪" is removed.
    """
    size_picker = shoe.find(attrs={"data-attribute": "pa_size"})
    price_tag = shoe.find(class_="woocommerce-Price-amount")
    title = shoe.find("h2")  # contains both name and link
    image = shoe.find("img")
    link = title.find("a") if title is not None else None
    # Explicit None checks: the truthiness of a parsed element is not a
    # reliable "was it found" signal.
    if any(part is None for part in (size_picker, price_tag, link, image)):
        return None

    # Prefer the lazy-load attribute, fall back to the plain one.
    img_url = image.attrs.get("data-src") or image.attrs.get("src")
    url = link.attrs.get("href")
    if not img_url or not url:
        return None

    size_options = size_picker.find_all(class_="st-custom-attribute")
    price_text = price_tag.text.replace("₪", "")
    return Shoe(
        title=title.text.strip(),
        img_url=img_url,
        url=url,
        sizes=[
            option.attrs["data-value"]
            for option in size_options
            if "data-value" in option.attrs
        ],
        # int(float(...)) accepts decimal text and drops the fractional part.
        price=int(float(price_text)),
    )
def shoes_in_page(page: BeautifulSoup, *, size: str = "40") -> Iterable[Shoe]:
    """Yield the wanted shoes found on one parsed listing page.

    A shoe is wanted when it could be parsed, is offered in ``size`` and its
    title does not contain the LEATHER word.

    Args:
        page: Parsed HTML of a category listing page.
        size: Size value that must appear in the shoe's ``sizes`` list.
            Defaults to "40", the value that used to be hard-coded.

    Yields:
        Every matching ``Shoe``, in document order.
    """
    for shoe_element in page.find_all(class_="content-product"):
        shoe = parse_shoe(shoe_element)
        if not shoe:
            # parse_shoe signals an unusable product with a falsy result.
            continue
        if size in shoe["sizes"] and LEATHER not in shoe["title"]:
            yield shoe
def make_request(url: str, page: int) -> Request:
    """Build (but do not send) the GET request for one page of a listing.

    Args:
        url: Address of the listing; slashes at either end are stripped
            before the page suffix is appended.
        page: Page number placed after ``/page/``.

    Returns:
        An unprepared ``Request`` carrying the browser-like User-Agent header.
    """
    page_url = f"{url.strip('/')}/page/{page}"
    browser_headers = {"User-Agent": USER_AGENT}
    return Request(method="GET", url=page_url, headers=browser_headers)
def fetch_html_pages(categories: List[str]) -> Iterable[str]:
    """Yield the HTML of every listing page of every category, in order.

    Pages are served from ``CACHE_DIR/<category>/page_<n>.html`` when present.
    Otherwise they are downloaded, stored there and then yielded. A category
    ends at the first page that is a "not found" response.

    Args:
        categories: Category paths relative to BASE_URL.

    Yields:
        The HTML text of each page.

    Raises:
        requests.HTTPError: If a page answers with an error status other than
            404. Such a body is neither a listing nor the end-of-category
            marker, so it must not be cached or parsed.
    """
    # The context manager releases the session's connections once the
    # generator is exhausted or closed.
    with Session() as sess:
        for category in categories:
            for page_num in count(1):
                print(f"{category} page {page_num}")
                cached_page = CACHE_DIR / category / f"page_{page_num}.html"
                if cached_page.exists():
                    # Explicit UTF-8: the pages contain Hebrew text and the
                    # locale default encoding is not UTF-8 everywhere.
                    # Cache files written earlier under another encoding
                    # have to be deleted and re-fetched.
                    yield cached_page.read_text(encoding="utf-8")
                    continue

                req = make_request(f"{BASE_URL}/{category}", page_num).prepare()
                response = sess.send(req, timeout=10)
                body = response.text
                if response.status_code == 404 or PAGE_NOT_FOUND in body:
                    print("Encountered a 404! Onto the next category... <3")
                    break
                # Fail loudly on any other error instead of caching the error
                # page and paging on forever.
                response.raise_for_status()

                cached_page.parent.mkdir(exist_ok=True, parents=True)
                cached_page.write_text(body, encoding="utf-8")
                yield body
def fetch_shoes(categories: List[str]) -> Iterable[Shoe]:
    """Yield the wanted shoes from every page of the given categories.

    Each page produced by fetch_html_pages is parsed with the "html.parser"
    backend and filtered through shoes_in_page.
    """
    for raw_html in fetch_html_pages(categories):
        soup = BeautifulSoup(raw_html, "html.parser")
        for shoe in shoes_in_page(soup):
            yield shoe
# pylint: disable=unused-variable
def _dev_cache_responses(num_of_pages=4):
    """Dev helper: pickle the first ``num_of_pages`` pages to ``cached_responses``.

    The pages come from fetch_html_pages(CATEGORIES) and are stored as a list
    of HTML strings in the working directory.
    """
    with open("cached_responses", "wb") as dump_file:
        first_pages = list(islice(fetch_html_pages(CATEGORIES), num_of_pages))
        pickle.dump(first_pages, dump_file)
# pylint: disable=unused-variable
def _dev_load_cached():
    """Dev helper: yield the wanted shoes from the pickled ``cached_responses``.

    Reads the list of HTML strings stored in ``cached_responses`` in the
    working directory, parses each one and yields what shoes_in_page selects.
    """
    # NOTE: unpickling can execute arbitrary code — only ever load a
    # "cached_responses" file that this script produced itself.
    with open("cached_responses", "rb") as file:
        raw_pages = pickle.load(file)
    # The file is closed before iterating, so no handle stays open while this
    # generator is suspended.
    for raw_page in raw_pages:
        # Same explicit parser as fetch_shoes, so cached and live runs parse
        # identically regardless of which parsers are installed.
        yield from shoes_in_page(BeautifulSoup(raw_page, "html.parser"))
def main():
    """Entry point: scrape every category and write the matches to ``hi.html``.

    Creates CACHE_DIR if needed, renders one SHOE_TEMPLATE fragment per shoe
    and writes the assembled page as UTF-8 into the working directory.
    """
    # Function-scope import keeps this fix self-contained.
    from html import escape  # pylint: disable=import-outside-toplevel

    CACHE_DIR.mkdir(exist_ok=True)
    # and for dev purposes, eval `shoes = list(_dev_load_cached())` instead
    shoes = fetch_shoes(CATEGORIES)
    # Every value comes from a scraped page, so it is HTML-escaped (quotes
    # included, for the href/src attributes) before being interpolated.
    shoes_div = "".join(
        SHOE_TEMPLATE.substitute(
            {field: escape(str(value)) for field, value in shoe.items()}
        )
        for shoe in shoes
    )
    outpage = HTML_OUTPUT_TEMPLATE.substitute(shoes=shoes_div)
    with open("hi.html", "w", encoding="UTF-8") as f:
        f.write(outpage)


if __name__ == "__main__":
    main()
# (End of gist. GitHub page footer text removed; it is not part of the script.)