Skip to content

Instantly share code, notes, and snippets.

@it-is-wednesday
Last active September 15, 2022 09:35
Show Gist options
  • Save it-is-wednesday/371f59a1e33cca6e3afcc77720f63b51 to your computer and use it in GitHub Desktop.
"""
Finishing writing this script, I found out that Yad2 has aggressive
bot-checking settings, so this script is practically unusable as a background
scraper checking for new apartments, unless you're changing the headers every 2
minutes manually.
It was fun writing it, at least
Requires: requests, beautifulsoup4
"""
import csv
import json
from dataclasses import asdict, dataclass
from io import StringIO
from pathlib import Path
from typing import List, Optional
import requests
from bs4 import BeautifulSoup as bs
# --- Search configuration -------------------------------------------------
ROOMS = 3  # exact room count to search for (sent to Yad2 as the range "3-3")
MIN_PRICE, MAX_PRICE = 6000, 8000  # price window in NIS; both bounds exclusive
TLV_CODE = "5000"  # Yad2 numeric code for the city (Tel Aviv)
YAD_ELIYAHU_CODE = "206"  # Yad2 numeric code for the neighborhood (Yad Eliyahu)
YAD2_RENT_URL = "https://www.yad2.co.il/realestate/rent"  # search-results endpoint
YAD2_ITEM_BASE_URL = "https://www.yad2.co.il/item"  # base for per-listing links
# Page dump written when no listings are found (likely a bot-check page).
DEBUG_HTML_FILE = "last_downloaded_page.html"
# frozen=True makes instances immutable and therefore hashable, so they can be
# stored in sets and deduplicated/diffed across runs (see main()).
@dataclass(frozen=True)
class Apartment:
    "This class exists only because sets of dicts are illegal"
    price: int  # monthly rent in NIS
    address: str  # street address text scraped from the listing row
    link: str  # direct URL to the listing's item page
def parse_shekel(price):
    """
    Parse a Yad2 shekel price string into an int; None if it isn't numeric.

    >>> parse_shekel("5,500₪ ")
    5500
    """
    # Strip the currency sign, thousands separators, and surrounding whitespace
    # before attempting the conversion.
    cleaned = price.replace(",", "").replace("₪", "").strip()
    try:
        return int(cleaned)
    except ValueError:
        # Non-numeric price text (e.g. "לא צוין מחיר" / price not listed).
        return None
def apartment_details(apartment_soup) -> Optional[Apartment]:
    "Extracts price, address, and link from apartment row Soup object"
    price_text = apartment_soup.find(attrs={"data-test-id": "item_price"}).text.strip()
    price = parse_shekel(price_text)
    if not price:
        # Listing has no usable numeric price — skip it.
        return None
    # The first child of the "rows" element holds the address line.
    address = next(apartment_soup.find(class_="rows").children).text.strip()
    return Apartment(
        price=price,
        address=address,
        link=f"{YAD2_ITEM_BASE_URL}/{apartment_soup.attrs['item-id']}",
    )
def apartments_to_csv(apartments: List[Apartment]):
    """
    Transforms apartment objects to printable CSV
    >>> print(apartments_to_csv([Apartment(1, "a", "httpppp://a/")]))
    price,address,link
    1,a,httpppp://a/
    <BLANKLINE>
    """
    buffer = StringIO()
    writer = csv.DictWriter(
        buffer, fieldnames=["price", "address", "link"], lineterminator="\n"
    )
    writer.writeheader()
    # One row per apartment, converted from dataclass to dict for DictWriter.
    for apartment in apartments:
        writer.writerow(asdict(apartment))
    return buffer.getvalue()
def fetch(city_code, room_count, neighorhood_code, headers) -> str:
    """
    Download the Yad2 rent search-results page and return its raw HTML.

    :param city_code: Yad2 numeric city code, e.g. "5000" for Tel Aviv.
    :param room_count: exact room count; sent as the closed range "n-n".
    :param neighorhood_code: Yad2 numeric neighborhood code.
    :param headers: browser-like HTTP headers (needed to pass bot checks).
    :returns: the response body as text.
    :raises requests.Timeout: if the server does not answer within 30 seconds.
    """
    response = requests.get(
        YAD2_RENT_URL,
        params={
            "propertyGroup": "apartments",
            "city": city_code,
            "rooms": f"{room_count}-{room_count}",
            "neighborhood": neighorhood_code,
        },
        headers=headers,
        # Without a timeout, requests waits forever on an unresponsive server,
        # which would hang the whole script.
        timeout=30,
    )
    return response.text
def parse_headers(headers_raw: str):
    """
    Yield (name, value) header pairs from a raw request dump.

    The input is expected to be a request copied from the browser's devtools,
    so the request line ("GET ...") and the Cookie header are dropped — the
    cookies go stale quickly and must be refreshed separately.

    :param headers_raw: raw text with one "Name: value" header per line.
    :yields: (name, stripped value) tuples.
    """
    for line in headers_raw.splitlines():
        # Skip blank lines (a devtools dump ends the header section with one;
        # splitting an empty line on ":" would raise ValueError) and the
        # request line, which is not a header.
        if not line.strip() or line.startswith("GET"):
            continue
        key, value = line.split(":", maxsplit=1)
        if key == "Cookie":
            continue
        yield key, value.strip()
def main():
    "Entry point"
    # Headers come from a local "./headers" file — a raw request dump copied
    # from the browser's devtools (parse_headers drops the GET line and the
    # Cookie header).
    headers = dict(parse_headers(Path("./headers").read_text(encoding="UTF-8")))
    page = fetch(TLV_CODE, ROOMS, YAD_ELIYAHU_CODE, headers)
    apartments_html = bs(page, "html.parser").find_all(class_="feed_item")
    if not apartments_html:
        # No listing rows — per the module docstring this usually means Yad2
        # served a bot-check page. Dump it so the user can inspect it and
        # refresh their headers/cookies.
        Path(DEBUG_HTML_FILE).write_text(page, encoding="UTF-8")
        print(
            "Hmm... couldn't find any apartments in that page. "
            "Try changing headers/cookies?"
        )
        return
    # Unique apartments whose price falls strictly inside the configured window
    # (rows without a parseable price are dropped by apartment_details).
    apartments = {
        aprt
        for a_html in apartments_html
        if (aprt := apartment_details(a_html)) and MIN_PRICE < aprt.price < MAX_PRICE
    }
    # Load last run's results (if any) so only newly-seen listings are printed.
    existing_apartments_file = Path("./last_results.json")
    if existing_apartments_file.exists():
        raw_apartments = json.load(existing_apartments_file.open(encoding="UTF-8"))
        existing_apartments = set(Apartment(**a) for a in raw_apartments)
    else:
        existing_apartments = set()
    # Overwrite the state file with the current scrape for the next run.
    # NOTE: this must happen after the load above — "w" truncates the file.
    with existing_apartments_file.open("w", encoding="UTF-8") as outfile:
        todump = [asdict(a) for a in apartments]
        json.dump(todump, outfile, ensure_ascii=False, indent=4)
    # Print only the apartments not present in the previous run, as CSV.
    print(apartments_to_csv(apartments.difference(existing_apartments)))
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment