Created
December 15, 2019 21:27
-
-
Save Perishleaf/e8427b1217ff1bfbcc43fc95847c7a0e to your computer and use it in GitHub Desktop.
Define a function for compiling GET request and BeautifulSoup info into an array
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _extract_demography(info_containers):
    """Return ``(population, age)`` text from the profile rows, or "NA" for both.

    Only the first two rows are inspected: the demography figures are normally
    in the first row, but the page sometimes puts a promotion section first,
    which pushes them into the second row.
    """
    for container in info_containers[:2]:
        # NOTE(review): "css-jkrtif" / "css-54bw0x" look like generated class
        # names; confirm they still match the live page markup.
        sections = container.find_all('div', class_="css-jkrtif")
        if not sections:
            continue
        demography = sections[0].find_all('div', class_="css-54bw0x")
        # Both values are needed: index 0 is read as population, index 1 as age.
        if len(demography) >= 2:
            population = demography[0].text.replace(',', '')
            age = demography[1].text
            return population, age
    # No usable information on the page.
    return "NA", "NA"


# Define a function for compiling info into array
def getDemography(suburb_names, postcode_list, state='NSW'):
    """Scrape population and age figures for each suburb from domain.com.au.

    Args:
        suburb_names: Iterable of suburb names; spaces are replaced with
            hyphens to build the profile URL.
        postcode_list: Iterable of postcodes, paired positionally with
            ``suburb_names`` (extra items in the longer one are ignored).
        state: State abbreviation used in the URL (lower-cased). Defaults to
            'NSW', which yields the same URLs as before.

    Returns:
        A list with one entry per suburb; each entry is a one-element list
        holding the tuple ``(hyphenated_suburb, postcode, population, age)``.
        ``population`` and ``age`` are the string "NA" when the page carries
        no demography information.

    Errors raised by the GET request itself are not caught here.
    """
    # The same browser-like User-Agent is sent with every request.
    headers = {'User-Agent':
               'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
    state_slug = state.lower()

    Demography_list = []
    for suburb, postcode in zip(suburb_names, postcode_list):
        print(suburb)
        suburb = suburb.replace(' ', '-')

        # create the request URL
        url = 'https://www.domain.com.au/suburb-profile/{}-{}-{}'.format(
            suburb,
            state_slug,
            postcode)

        # make the GET request
        response = get(url, headers=headers)

        # Parse the html
        html_soup = BeautifulSoup(response.text, 'html.parser')
        info_containers = html_soup.find_all('div', class_="suburb-profile__row")

        # Always yields a value, so a failed lookup can neither leave the
        # names undefined nor reuse the previous suburb's figures.
        population, age = _extract_demography(info_containers)

        # return only relevant information for suburb
        Demography_list.append([(
            suburb,
            postcode,
            population,
            age)])

        # Wait between 5 and 15 seconds before scraping the next page to
        # mimic a human being browsing.
        time.sleep(random.randint(5, 15))

    return Demography_list
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment