Skip to content

Instantly share code, notes, and snippets.

@lafftar
Last active August 30, 2020 11:38
Show Gist options
  • Save lafftar/fc03524fd3bdac2ae9a87cae50a3d546 to your computer and use it in GitHub Desktop.
Save lafftar/fc03524fd3bdac2ae9a87cae50a3d546 to your computer and use it in GitHub Desktop.
GTA Cities Real Estate - 27 Cities - Quick Data Study - 1
from requests import Session
from pandas import DataFrame
from bs4 import BeautifulSoup as bs
from time import time
# Scrape listing.ca price-history figures for each place linked from the
# first sortable table of Wikipedia's "Greater Toronto and Hamilton Area" page.

# Selector for the third cell of a given row in the table under the 4th child
# of `#right > div.mt30`; `{row}` is the 1-based row number.
ROC_CELL_SELECTOR = ('#right > div.mt30 > div:nth-child(4)> div.rbox > table '
                     '> tr:nth-child({row}) > td:nth-child(3) > span')
# Selector for the average property price box (per the original comment).
AVG_PRICE_SELECTOR = '#right > div.mt30 > div:nth-child(3) > div.rbox > div:nth-child(1)'
# Bare listing.ca root that a request is compared against to detect a redirect.
LISTING_CA_HOME = 'https://listing.ca'

# Wall-clock start time; not read back anywhere in this section.
t1 = time()

# A single Session is shared by the Wikipedia request and every listing.ca request.
main_session = Session()
page = main_session.get('https://en.wikipedia.org/wiki/Greater_Toronto_and_Hamilton_Area').content
page = bs(page, 'lxml')

# Result accumulator; nothing is appended to it in this section.
dump = []

# Every anchor with an href inside the first 'wikitable sortable' table,
# minus the first one (presumably a non-city link - TODO confirm).
city_table = page.find_all('table', class_='wikitable sortable')[0]
city_links = city_table.find_all('a', attrs={'href': True})[1:]

for element in city_links:
    # The link text, lower-cased with whitespace runs replaced by '-',
    # is used as the listing.ca subdomain.
    url = f"https://{'-'.join(element.text.split()).lower()}.listing.ca/real-estate-price-history.htm"
    print(f"Scraping {element.text}")
    resp = main_session.get(url)

    # requests reports a host-only URL with a trailing "/" path, so the final
    # URL of a redirect to the home page is 'https://listing.ca/'. Ignore that
    # slash; an exact match against 'https://listing.ca' would never succeed
    # and the selectors below would then fail on the home page.
    if resp.url.rstrip('/') == LISTING_CA_HOME:
        print(f"{element.text} - {url} got redirected, skipping")
        continue

    listing_ca_page = bs(resp.content, 'lxml')

    # NOTE: select_one returns None when a selector matches nothing, in which
    # case `.text` raises AttributeError (same as the original behaviour).
    avg_prop_price = listing_ca_page.select_one(AVG_PRICE_SELECTOR).text.strip()  # avg prop price
    ten_years_roc = listing_ca_page.select_one(
        ROC_CELL_SELECTOR.format(row=10)).text.strip()  # 10 yrs roc
    five_years_roc = listing_ca_page.select_one(
        ROC_CELL_SELECTOR.format(row=9)).text.strip()  # 5 yr roc
    one_year_roc = listing_ca_page.select_one(
        ROC_CELL_SELECTOR.format(row=7)).text.strip()  # 1 yr roc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment