Skip to content

Instantly share code, notes, and snippets.

@jafow
Created May 9, 2022 03:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jafow/b198fedbe015b70b677e02372fbbfdf9 to your computer and use it in GitHub Desktop.
Save jafow/b198fedbe015b70b677e02372fbbfdf9 to your computer and use it in GitHub Desktop.
Getting Zillow info about CO
import logging
import re
from pathlib import Path
from bs4 import BeautifulSoup
# Install the default stream handler on the root logger. No level is passed,
# so the root logger keeps its default WARNING level and the logger.debug(...)
# calls in this file produce no output unless the level is lowered.
logging.basicConfig()
# Module-wide logger; getLogger() with no name returns the root logger.
logger = logging.getLogger()
def get_address(article) -> str:
    """Return the text of the <address> tag inside an article's first link.

    The lookup ``article.div.a.address`` is walked one step at a time so that
    a missing <div>, <a>, or <address> produces the fallback string instead
    of an ``AttributeError``.

    Args:
        article: A BeautifulSoup ``Tag`` for one <article> element (or any
            object exposing the same ``.div`` / ``.a`` / ``.address`` /
            ``.text`` attribute chain).

    Returns:
        The address text, or ``'No address found'`` when any element in the
        chain is missing.
    """
    container = article.div
    link = container.a if container is not None else None
    # NOTE: ``hasattr(tag, "address")`` is always true for a BeautifulSoup
    # Tag, because unknown attribute names are resolved as child-tag lookups
    # that return None when nothing matches. Test the looked-up value itself.
    address = link.address if link else None
    if address is not None:
        logger.debug("Addr === %s", address.text)
        return address.text
    logger.debug("No address tag found in this html; %s", link)
    return 'No address found'
def get_price_info(article) -> str:
    """Return an article's price details as a single tab-separated string.

    The first field is the main offering price: the text of the first <div>
    inside the <div> whose class matches ``list-card-heading``. Any further
    fields are the texts of the children of that element's <ul> (e.g. 2br,
    3br options and their respective prices).

    Example of a full output line once the caller prepends the address
    (price fields are separated by tab characters):
        The Lodge | 4697 E Louisiana Ave, Denver, CO, $1,405+ 1 bd $2,166+ 2 bds

    Args:
        article: A BeautifulSoup ``Tag`` for one <article> element (or any
            object with a compatible ``find`` method).

    Returns:
        The tab-joined price fields, or ``''`` when the heading element or
        its price <div> is missing.
    """
    card_regex = re.compile("list-card-heading")
    detail = article.find("div", class_=card_regex)
    # find() returns None when no element matches, so guard before touching
    # ``detail.div``; without a price <div> there is no data to report.
    if detail is None or not detail.div:
        return ''

    # Grab the main offering price info.
    price_info = [detail.div.text]

    # Grab any additional room options; a card with a single price may have
    # no <ul> at all, in which case only the main price is returned.
    options = detail.ul
    if options is not None:
        price_info.extend(option.text for option in options.children)

    return '\t'.join(price_info)
def soup_it(html):
    """Parse an HTML fragment and report every <article> it contains.

    For each <article>, the address and price info are printed to stdout as
    ``"<address>, <price info>"`` and the same pair is logged at debug level.

    Parsing is done with Beautiful Soup; see
    https://beautiful-soup-4.readthedocs.io/en/latest/#
    """
    listings = BeautifulSoup(html, "html.parser").find_all("article")
    logger.debug("The total count found of articles: %d", len(listings))

    for listing in listings:
        where, cost = get_address(listing), get_price_info(listing)
        print(f"{where}, {cost}")
        logger.debug("%s, %s", where, cost)
def main():
    """Entry point: print the rental listings found in ``./DenverZillow.har``.

    Steps:
      - read the whole HAR file into memory
      - regular-expression match the HTML <article> tags, which hold the
        rental info
      - hand each match to ``soup_it``, which parses the HTML and prints one
        line per listing: the address followed by the tab-separated price
        and extra room options

    Example output line:
        4400 Syracuse | 4400 S Syracuse St, Denver, CO, $1,550+ 1 bd $2,225+ 2 bds $2,930+ 3 bds

    Raises:
        OSError: if the HAR file is missing or cannot be read.
    """
    print("== where are all the good affordable housing??? ==")

    data_file = Path('./DenverZillow.har')
    # HAR is a JSON format saved as UTF-8; decode explicitly rather than
    # relying on the platform's default encoding.
    doc = data_file.read_text(encoding="utf-8")

    # Match the HTML <article> ... </article> spans. ``.`` does not cross
    # newlines, so each match stays on one line, and the greedy pattern may
    # capture several articles at once; soup_it handles every <article>
    # inside the fragment it is given.
    matcher = r'<article\W.*>.*</article>'

    # Parse them!
    for tag in re.findall(matcher, doc):
        soup_it(tag)

    print("=== welp that's it === ")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment