Skip to content

Instantly share code, notes, and snippets.

@it-is-wednesday
Last active September 15, 2022 09:35
Show Gist options
  • Save it-is-wednesday/371f59a1e33cca6e3afcc77720f63b51 to your computer and use it in GitHub Desktop.
"""
Finishing writing this script, I found out that Yad2 has aggressive
bot-checking settings, so this script is practically unusable as a background
scraper checking for new apartments, unless you're changing the headers every 2
minutes manually.
It was fun writing it, at least
Requires: requests, beautifulsoup4
"""
import csv
import json
from dataclasses import asdict, dataclass
from io import StringIO
from pathlib import Path
from typing import List, Optional
import requests
from bs4 import BeautifulSoup as bs
# --- Search configuration -------------------------------------------------
ROOMS = 3  # exact room count to search for (sent to Yad2 as the range "3-3")
MIN_PRICE, MAX_PRICE = 6000, 8000  # price window in NIS; both bounds exclusive
TLV_CODE = "5000"  # Yad2 numeric code for the city (Tel Aviv)
YAD_ELIYAHU_CODE = "206"  # Yad2 numeric code for the neighborhood (Yad Eliyahu)
YAD2_RENT_URL = "https://www.yad2.co.il/realestate/rent"  # search-results endpoint
YAD2_ITEM_BASE_URL = "https://www.yad2.co.il/item"  # base for per-listing links
# Page dump written when no listings are found (likely a bot-check page).
DEBUG_HTML_FILE = "last_downloaded_page.html"
# frozen=True makes instances immutable and therefore hashable, so they can be
# stored in sets and deduplicated/diffed across runs (see main()).
@dataclass(frozen=True)
class Apartment:
    "This class exists only because sets of dicts are illegal"
    price: int  # monthly rent in NIS
    address: str  # street address text scraped from the listing row
    link: str  # direct URL to the listing's item page
def parse_shekel(price):
    """
    Parse a Yad2 shekel price string into an int; None if it isn't numeric.

    >>> parse_shekel("5,500₪ ")
    5500
    """
    # Strip the currency sign, thousands separators, and surrounding whitespace
    # before attempting the conversion.
    cleaned = price.replace(",", "").replace("₪", "").strip()
    try:
        return int(cleaned)
    except ValueError:
        # Non-numeric price text (e.g. "לא צוין מחיר" / price not listed).
        return None
def apartment_details(apartment_soup) -> Optional[Apartment]:
    "Extracts price, address, and link from apartment row Soup object"
    price_text = apartment_soup.find(attrs={"data-test-id": "item_price"}).text.strip()
    price = parse_shekel(price_text)
    if not price:
        # Listing has no usable numeric price — skip it.
        return None
    # The first child of the "rows" element holds the address line.
    address = next(apartment_soup.find(class_="rows").children).text.strip()
    return Apartment(
        price=price,
        address=address,
        link=f"{YAD2_ITEM_BASE_URL}/{apartment_soup.attrs['item-id']}",
    )
def apartments_to_csv(apartments: List[Apartment]):
    """
    Transforms apartment objects to printable CSV
    >>> print(apartments_to_csv([Apartment(1, "a", "httpppp://a/")]))
    price,address,link
    1,a,httpppp://a/
    <BLANKLINE>
    """
    buffer = StringIO()
    writer = csv.DictWriter(
        buffer, fieldnames=["price", "address", "link"], lineterminator="\n"
    )
    writer.writeheader()
    # One row per apartment, converted from dataclass to dict for DictWriter.
    for apartment in apartments:
        writer.writerow(asdict(apartment))
    return buffer.getvalue()
def fetch(city_code, room_count, neighorhood_code, headers) -> str:
    """
    Download the Yad2 rent search-results page and return its raw HTML.

    :param city_code: Yad2 numeric city code, e.g. "5000" for Tel Aviv.
    :param room_count: exact room count; sent as the closed range "n-n".
    :param neighorhood_code: Yad2 numeric neighborhood code.
    :param headers: browser-like HTTP headers (needed to pass bot checks).
    :returns: the response body as text.
    :raises requests.Timeout: if the server does not answer within 30 seconds.
    """
    response = requests.get(
        YAD2_RENT_URL,
        params={
            "propertyGroup": "apartments",
            "city": city_code,
            "rooms": f"{room_count}-{room_count}",
            "neighborhood": neighorhood_code,
        },
        headers=headers,
        # Without a timeout, requests waits forever on an unresponsive server,
        # which would hang the whole script.
        timeout=30,
    )
    return response.text
def parse_headers(headers_raw: str):
    """
    Yield (name, value) header pairs from a raw request dump.

    The input is expected to be a request copied from the browser's devtools,
    so the request line ("GET ...") and the Cookie header are dropped — the
    cookies go stale quickly and must be refreshed separately.

    :param headers_raw: raw text with one "Name: value" header per line.
    :yields: (name, stripped value) tuples.
    """
    for line in headers_raw.splitlines():
        # Skip blank lines (a devtools dump ends the header section with one;
        # splitting an empty line on ":" would raise ValueError) and the
        # request line, which is not a header.
        if not line.strip() or line.startswith("GET"):
            continue
        key, value = line.split(":", maxsplit=1)
        if key == "Cookie":
            continue
        yield key, value.strip()
def main():
    "Entry point"
    # Headers come from a local "./headers" file — a raw request dump copied
    # from the browser's devtools (parse_headers drops the GET line and the
    # Cookie header).
    headers = dict(parse_headers(Path("./headers").read_text(encoding="UTF-8")))
    page = fetch(TLV_CODE, ROOMS, YAD_ELIYAHU_CODE, headers)
    apartments_html = bs(page, "html.parser").find_all(class_="feed_item")
    if not apartments_html:
        # No listing rows — per the module docstring this usually means Yad2
        # served a bot-check page. Dump it so the user can inspect it and
        # refresh their headers/cookies.
        Path(DEBUG_HTML_FILE).write_text(page, encoding="UTF-8")
        print(
            "Hmm... couldn't find any apartments in that page. "
            "Try changing headers/cookies?"
        )
        return
    # Unique apartments whose price falls strictly inside the configured window
    # (rows without a parseable price are dropped by apartment_details).
    apartments = {
        aprt
        for a_html in apartments_html
        if (aprt := apartment_details(a_html)) and MIN_PRICE < aprt.price < MAX_PRICE
    }
    # Load last run's results (if any) so only newly-seen listings are printed.
    existing_apartments_file = Path("./last_results.json")
    if existing_apartments_file.exists():
        raw_apartments = json.load(existing_apartments_file.open(encoding="UTF-8"))
        existing_apartments = set(Apartment(**a) for a in raw_apartments)
    else:
        existing_apartments = set()
    # Overwrite the state file with the current scrape for the next run.
    # NOTE: this must happen after the load above — "w" truncates the file.
    with existing_apartments_file.open("w", encoding="UTF-8") as outfile:
        todump = [asdict(a) for a in apartments]
        json.dump(todump, outfile, ensure_ascii=False, indent=4)
    # Print only the apartments not present in the previous run, as CSV.
    print(apartments_to_csv(apartments.difference(existing_apartments)))
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment