Skip to content

Instantly share code, notes, and snippets.

@pjleimbigler
Created May 1, 2020 03:40
Show Gist options
  • Save pjleimbigler/dad8e7be67d17c47488cab5936c09dcb to your computer and use it in GitHub Desktop.
Save pjleimbigler/dad8e7be67d17c47488cab5936c09dcb to your computer and use it in GitHub Desktop.
Python script to scrape long-term care (LTC) home data from http://publicreporting.ltchomes.net/en-ca/Search_Selection.aspx
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
# Scrape the profile page of every home linked from the search page and
# collect the results into a single DataFrame (`df`), one row per home.
baseurl = 'http://publicreporting.ltchomes.net/en-ca/'
url = baseurl + 'Search_Selection.aspx'

# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would hang the whole scrape indefinitely.
REQUEST_TIMEOUT = 30


def _value_after_colon(text):
    """Return the part of a 'Label : value' string after the first colon.

    Always returns a stripped string (never a list): text without a colon
    is returned whole, and any further colons stay part of the value.
    """
    _label, sep, value = text.partition(':')
    return value.strip() if sep else text.strip()


def _block_text(blocks, index):
    """Return the text of blocks[index], or '' if there is no such block."""
    return blocks[index].text if index < len(blocks) else ''


response = requests.get(url, timeout=REQUEST_TIMEOUT)
# Fail loudly if the index page itself cannot be fetched.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
# href=True skips anchors without a link target instead of raising KeyError.
homelinks = [(x.text, x['href'])
             for x in soup.find_all("a", class_="rsLink", href=True)]

i = 0
dfs = []
for i, (home, link) in enumerate(homelinks, start=1):
    try:
        this_response = requests.get(baseurl + link, timeout=REQUEST_TIMEOUT)
        this_response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable page should not discard everything scraped so far.
        print('Skipping', home, '-', exc)
        time.sleep(0.1)
        continue
    this_soup = BeautifulSoup(this_response.text, 'html.parser')

    # Get address block: the first four HomeAddress divs are read as
    # street address, "city, postal code", telephone and fax, in that order.
    address = this_soup.find_all('div', {'class': 'HomeAddress'})[:4]

    # Parse name and address data. Missing blocks or separators yield ''
    # rather than raising, so one odd page cannot abort the run.
    city_parts = _block_text(address, 1).split(',')
    tel = _value_after_colon(_block_text(address, 2))
    fax = _value_after_colon(_block_text(address, 3))

    this_df = {
        'Home': home,
        'Address': _block_text(address, 0),
        'City': city_parts[0],
        'Postal_code': city_parts[1] if len(city_parts) > 1 else '',
        'Tel': tel,
        'Fax': fax,
    }

    # Parse home profile data: col1 cells are labels, col2 cells are values.
    col1 = this_soup.find_all('div', class_='Profilerow_col1')
    col2 = this_soup.find_all('div', class_='Profilerow_col2')
    for k, v in zip(col1, col2):
        # Compare by value: `is not ''` tests object identity, which is
        # implementation-dependent for strings (SyntaxWarning on 3.8+).
        this_df[k.text] = v.text if v.text != '' else 'NA'

    dfs.append(this_df)

    # Pause to avoid DoS and/or ban
    time.sleep(0.1)
    # Progress indicator: number of homes processed so far.
    print(i)

df = pd.DataFrame(dfs)

# # Uncomment to (over)write to file
# df.to_csv('ON-LTC-scraped-2020-04-30.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment