# jafow/zillow.py

## zillow.py
import logging
import re
from pathlib import Path
from bs4 import BeautifulSoup

# Module-wide logger: getLogger() with no name returns the root logger.
# basicConfig() is called without arguments, so no level is chosen here;
# the logger.debug() calls below only show up once a level is configured.
logger = logging.getLogger()
logging.basicConfig()


def get_address(article) -> str:
    """Return the text of the <address> tag inside the article's first link.

    The lookup ``article.div.a.address`` is walked one step at a time so that
    a listing missing any of those tags produces the fallback string instead
    of raising ``AttributeError``.

    Args:
        article: a parsed <article> element; only attribute-style child
            lookup (``.div``, ``.a``, ``.address``, ``.text``) is used.

    Returns:
        The address text, or ``'No address found'`` when the <div>, <a> or
        <address> tag is absent.
    """
    container = article.div
    addr = container.a if container is not None else None
    # BeautifulSoup answers ``tag.<name>`` with None for a missing child
    # instead of raising, so ``hasattr(addr, "address")`` can never be false;
    # compare against None explicitly.
    address_tag = addr.address if addr is not None else None
    if address_tag is not None:
        logger.debug("Addr === %s", address_tag.text)
        return address_tag.text

    logger.debug("No address tag found in this html; %s", addr)
    return 'No address found'


def get_price_info(article) -> str:
    """Return the listing's price fields joined by tab characters.

    The first field is the text of the first <div> inside the element whose
    class matches ``list-card-heading`` (the main offering). The text of each
    child of that element's <ul> follows (additional room options, e.g. 2 bd
    or 3 bds and their respective prices).

    Args:
        article: a parsed <article> element exposing ``find``.

    Returns:
        The tab-separated price string, or ``''`` when the heading element or
        its inner <div> is missing. When there is no <ul>, only the main
        price is returned.
    """
    card_regex = re.compile("list-card-heading")
    detail = article.find("div", class_=card_regex)
    if detail is None or detail.div is None:
        # we do not have any data here, just return an empty string
        return ''

    # grab the main offering price info
    price_info = [detail.div.text]

    # grab any additional room options; a listing with a single offering
    # has no <ul>, so skip the extension instead of dereferencing None
    room_options = detail.ul
    if room_options is not None:
        price_info.extend(option.text for option in room_options.children)

    # return a string whose fields are separated by a tab character
    # example:
    # $1,405+ 1 bd    $2,166+ 2 bds
    return '\t'.join(price_info)


def soup_it(html):
    """Parse a chunk of html and print one line per <article> found in it.

    For every <article> element the address and the price info are looked up
    and emitted as ``"<address>, <price info>"`` on stdout; the same pair is
    also sent to the debug log.
    """
    # BeautifulSoup docs: https://beautiful-soup-4.readthedocs.io/en/latest/#
    parsed = BeautifulSoup(html, "html.parser")
    listings = parsed.find_all("article")
    logger.debug("The total count found of articles: %d", len(listings))
    for listing in listings:
        # address first, then prices -- same call order as the output order
        address, price_info = get_address(listing), get_price_info(listing)
        print(f"{address}, {price_info}")
        logger.debug("%s, %s", address, price_info)

def main(har_path='./DenverZillow.har'):
    """ The entrypoint to this program. It
    - reads the whole HAR file into memory
    - does a regular expression match for the <article> tags which have the
        juicy rental info
    - hands every match to soup_it(), which parses the html and prints one
        line per listing: the address, then the tab-separated prices
        example:
        4400 Syracuse | 4400 S Syracuse St, Denver, CO, $1,550+ 1 bd  $2,225+ 2 bds   $2,930+ 3 bds

    Args:
        har_path: location of the HAR capture to scan. Defaults to
            ``./DenverZillow.har`` relative to the current directory.

    Raises:
        OSError: (e.g. FileNotFoundError) when the file cannot be read.
    """
    print("== where are all the good affordable housing??? ==")

    # Decode explicitly as UTF-8 (the encoding HAR files are saved in) rather
    # than the platform default, and close the file before parsing starts.
    doc = Path(har_path).read_text(encoding="utf-8")

    # a regular expression to match the HTML <article> tags.
    # `.` does not cross newlines and `.*` is greedy, so one match may hold
    # several articles from the same line; soup_it() separates them.
    matcher = r'<article\W.*>.*</article>'

    # parse every matching chunk
    for tag in re.findall(matcher, doc):
        soup_it(tag)

    print("=== welp that's it === ")


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()