Last active
September 15, 2022 09:35
-
-
Save it-is-wednesday/371f59a1e33cca6e3afcc77720f63b51 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Finishing writing this script, I found out that Yad2 has aggressive | |
bot-checking settings, so this script is practically unusable as a background | |
scraper checking for new apartments, unless you're changing the headers every 2 | |
minutes manually. | |
It was fun writing it, at least | |
Requires: requests, beautifulsoup4 | |
""" | |
import csv | |
import json | |
from dataclasses import asdict, dataclass | |
from io import StringIO | |
from pathlib import Path | |
from typing import List, Optional | |
import requests | |
from bs4 import BeautifulSoup as bs | |
# Search filters. ROOMS is sent as both ends of the "rooms" range; the price
# bounds are compared against each parsed listing price (exclusive on both ends).
ROOMS = 3
MIN_PRICE = 6000
MAX_PRICE = 8000

# Codes sent as the "city" and "neighborhood" query parameters.
TLV_CODE = "5000"
YAD_ELIYAHU_CODE = "206"

# Endpoints: the search page, and the prefix used to build per-listing links.
YAD2_RENT_URL = "https://www.yad2.co.il/realestate/rent"
YAD2_ITEM_BASE_URL = "https://www.yad2.co.il/item"

# Where the downloaded page is saved when no listings could be found in it.
DEBUG_HTML_FILE = "last_downloaded_page.html"
@dataclass(frozen=True)
class Apartment:
    """Immutable record describing one apartment listing.

    A frozen dataclass is hashable, so listings can be stored in sets and
    diffed between runs -- something plain dicts do not allow.
    """

    # Listing price as an integer.
    price: int
    # Address text taken from the listing row.
    address: str
    # URL of the listing page.
    link: str
def parse_shekel(price):
    """
    Convert a displayed shekel amount into an integer.

    Thousands separators (","), the shekel sign and surrounding whitespace
    are removed before conversion. Returns None when what is left is not a
    valid integer.

    >>> parse_shekel("5,500₪ ")
    5500
    """
    cleaned = price
    for noise in (",", "₪"):
        cleaned = cleaned.replace(noise, "")

    try:
        amount = int(cleaned.strip())
    except ValueError:
        return None
    return amount
def apartment_details(apartment_soup) -> Optional[Apartment]:
    """Extract price, address, and link from an apartment row Soup object.

    Args:
        apartment_soup: Parsed element for a single feed row.

    Returns:
        An Apartment, or None when the row cannot be turned into one: the
        price element is missing, the price text does not parse to a non-zero
        integer, the "rows" element is missing or has no children, or the row
        has no "item-id" attribute.
    """
    price_tag = apartment_soup.find(attrs={"data-test-id": "item_price"})
    if price_tag is None:
        # find() returns None when nothing matches; skip the row, don't crash.
        return None

    price = parse_shekel(price_tag.text.strip())
    if not price:
        # Unparsable (or zero) price: the row is skipped.
        return None

    rows = apartment_soup.find(class_="rows")
    # The address is the text of the first child of the "rows" element.
    address_node = next(iter(rows.children), None) if rows is not None else None
    item_id = apartment_soup.attrs.get("item-id")
    if address_node is None or item_id is None:
        return None

    return Apartment(
        price=price,
        address=address_node.text.strip(),
        link=f"{YAD2_ITEM_BASE_URL}/{item_id}",
    )
def apartments_to_csv(apartments: List[Apartment]):
    """
    Render apartment objects as CSV text.

    The output starts with a "price,address,link" header line, followed by one
    line per apartment; every line ends with a newline character.

    >>> print(apartments_to_csv([Apartment(1, "a", "httpppp://a/")]))
    price,address,link
    1,a,httpppp://a/
    <BLANKLINE>
    """
    columns = ["price", "address", "link"]
    with StringIO() as buffer:
        sheet = csv.DictWriter(buffer, fieldnames=columns, lineterminator="\n")
        sheet.writeheader()
        for apartment in apartments:
            sheet.writerow(asdict(apartment))
        rendered = buffer.getvalue()
    return rendered
def fetch(city_code, room_count, neighorhood_code, headers, timeout=30) -> str:
    """Download the rent search results page and return its body as text.

    Args:
        city_code: Value for the "city" query parameter.
        room_count: Room count; sent as both ends of the "rooms" range.
        neighorhood_code: Value for the "neighborhood" query parameter.
        headers: HTTP headers to send with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on a stalled connection.

    Returns:
        The response body decoded as text. The HTTP status is not checked, so
        an error or bot-check page is returned as-is for the caller to inspect.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    response = requests.get(
        YAD2_RENT_URL,
        params={
            "propertyGroup": "apartments",
            "city": city_code,
            "rooms": f"{room_count}-{room_count}",
            "neighborhood": neighorhood_code,
        },
        headers=headers,
        timeout=timeout,
    )
    return response.text
def parse_headers(headers_raw: str):
    """Yield (name, value) pairs parsed from a raw HTTP request dump.

    Skipped lines:
      * the request line (anything starting with "GET");
      * blank lines and lines without a ":" separator, which previously
        raised ValueError on unpacking;
      * lines whose name part is empty (e.g. ":authority: host" pseudo-headers);
      * the Cookie header, matched case-insensitively since HTTP header names
        are case-insensitive.

    Args:
        headers_raw: Text with one header per line in "Name: value" form.

    Yields:
        Tuples of (header name, header value), both stripped of surrounding
        whitespace. Only the first ":" separates name from value, so values
        may themselves contain colons (URLs, timestamps).
    """
    for line in headers_raw.splitlines():
        if line.startswith("GET"):
            continue

        key, separator, value = line.partition(":")
        key = key.strip()
        if not separator or not key:
            continue

        if key.lower() == "cookie":
            continue

        yield key, value.strip()
def main():
    """Entry point: fetch listings, persist them, and print the new ones.

    Steps:
      1. Read request headers from ./headers and download the search page.
      2. Parse every "feed_item" element; if there are none, save the page to
         DEBUG_HTML_FILE for inspection, print a hint, and stop.
      3. Keep apartments priced strictly between MIN_PRICE and MAX_PRICE.
      4. Load the previous run's results from ./last_results.json (if any),
         overwrite that file with the current results, and print as CSV the
         apartments that were not in the previous results.
    """
    headers = dict(parse_headers(Path("./headers").read_text(encoding="UTF-8")))
    page = fetch(TLV_CODE, ROOMS, YAD_ELIYAHU_CODE, headers)
    apartments_html = bs(page, "html.parser").find_all(class_="feed_item")

    if not apartments_html:
        Path(DEBUG_HTML_FILE).write_text(page, encoding="UTF-8")
        print(
            "Hmm... couldn't find any apartments in that page. "
            "Try changing headers/cookies?"
        )
        return

    apartments = {
        aprt
        for a_html in apartments_html
        if (aprt := apartment_details(a_html)) and MIN_PRICE < aprt.price < MAX_PRICE
    }

    existing_apartments_file = Path("./last_results.json")
    if existing_apartments_file.exists():
        # Use a context manager so the file handle is closed deterministically
        # before the same file is reopened for writing below.
        with existing_apartments_file.open(encoding="UTF-8") as infile:
            raw_apartments = json.load(infile)
        existing_apartments = {Apartment(**a) for a in raw_apartments}
    else:
        existing_apartments = set()

    with existing_apartments_file.open("w", encoding="UTF-8") as outfile:
        todump = [asdict(a) for a in apartments]
        json.dump(todump, outfile, ensure_ascii=False, indent=4)

    print(apartments_to_csv(apartments.difference(existing_apartments)))
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment