Created
May 9, 2022 03:45
-
-
Save jafow/b198fedbe015b70b677e02372fbbfdf9 to your computer and use it in GitHub Desktop.
getting zillow info about CO
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import re | |
from pathlib import Path | |
from bs4 import BeautifulSoup | |
# Install the default handler on the root logger so that records emitted
# anywhere in the process have somewhere to go.
logging.basicConfig()
# Use a module-named logger rather than the root logger itself; its records
# still propagate to the root handler configured above.
logger = logging.getLogger(__name__)
def get_address(article) -> str:
    """Return the text of the ``<address>`` tag inside an article's first link.

    The lookup path is ``article.div.a.address``: the first ``<div>`` under
    *article*, the first ``<a>`` under that div, and the first ``<address>``
    under that link.

    Args:
        article: A parsed ``<article>`` element (a BeautifulSoup ``Tag``).

    Returns:
        The address text, or the literal ``'No address found'`` when any
        element along the lookup path is missing.
    """
    # Attribute access on a BeautifulSoup Tag is shorthand for find(), which
    # yields None when the child is absent. hasattr() is therefore always
    # true and cannot be used as the presence check; compare against None.
    container = article.div
    addr = container.a if container is not None else None
    address_tag = addr.address if addr is not None else None
    if address_tag is not None:
        logger.debug("Addr === %s", address_tag.text)
        return address_tag.text
    logger.debug("No address tag found in this html; %s", addr)
    return 'No address found'
def get_price_info(article) -> str:
    """Return the price details of an article as one tab-separated string.

    The first ``<div>`` whose class matches ``list-card-heading`` is located
    inside *article*. The text of its first nested ``<div>`` is the main
    offering price; the text of each child of its ``<ul>`` (if present) is an
    additional room option (e.g. 2br, 3br and their respective prices).

    Args:
        article: A parsed ``<article>`` element (a BeautifulSoup ``Tag``).

    Returns:
        The main price followed by every room option, joined with tab
        characters, or ``''`` when the article has no price heading.
    """
    card_regex = re.compile("list-card-heading")
    detail = article.find("div", class_=card_regex)
    # find() returns None when nothing matches, and so does the ``.div``
    # shorthand; in both cases there is no data here, so return an empty
    # string instead of failing on a None attribute access.
    if detail is None or not detail.div:
        return ''
    # grab the main offering price info
    price_info = [detail.div.text]
    # grab any additional room options (e.g. 2br, 3br and their respective
    # prices); a heading without a <ul> simply has none
    options = detail.ul
    if options is not None:
        price_info.extend(option.text for option in options.children)
    # the pieces are separated by a tab character; a full output line built
    # from this value looks like:
    # The Lodge | 4697 E Louisiana Ave, Denver, CO, $1,405+ 1 bd $2,166+ 2 bds
    return '\t'.join(price_info)
def soup_it(html):
    """Parse *html* and print one ``address, price info`` line per article.

    Every ``<article>`` element found in the markup is passed to
    ``get_address`` and ``get_price_info``; the two results are printed to
    stdout and also sent to the debug log.
    """
    # BeautifulSoup is a very good HTML parsing library; read more about it
    # here: https://beautiful-soup-4.readthedocs.io/en/latest/#
    parsed = BeautifulSoup(html, "html.parser")
    found = parsed.find_all("article")
    logger.debug("The total count found of articles: %d", len(found))
    for entry in found:
        address, price_info = get_address(entry), get_price_info(entry)
        print(f"{address}, {price_info}")
        logger.debug("%s, %s", address, price_info)
def main():
    """Run the program: extract rental listings from the saved HAR file.

    Steps:
      - reads ``./DenverZillow.har`` fully into memory
      - does a regular expression match for the <article> tags which have
        the juicy rental info
      - parses the html into some python objects that are nicer to use
      - prints out one line per listing with the address, the price and any
        extra room options and their prices (the price parts are
        tab-separated)

    Example output line:
        4400 Syracuse | 4400 S Syracuse St, Denver, CO, $1,550+ 1 bd $2,225+ 2 bds $2,930+ 3 bds

    Raises:
        OSError: if the HAR file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    print("== where are all the good affordable housing??? ==")
    data_file = Path('./DenverZillow.har')
    # HAR files are JSON and therefore UTF-8; decode explicitly rather than
    # relying on the platform's locale encoding.
    doc = data_file.read_text(encoding="utf-8")
    # a regular expression to match the HTML <article> tags. NOTE: `.*` is
    # greedy and `.` does not cross newlines, so each match stays on one line
    # and runs from the first `<article` to the last `</article>` on that
    # line; soup_it() then splits the match into individual articles.
    matcher = r'<article\W.*>.*</article>'
    # parse every matching chunk!
    for tag in re.findall(matcher, doc):
        soup_it(tag)
    print("=== welp that's it === ")
# Only run the scraper when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment