Created
February 7, 2020 06:00
-
-
Save IlnarSelimcan/32aab1de5228649db45efc45299ba7f3 to your computer and use it in GitHub Desktop.
An example of me scraping a website using Python3 (with Requests & BeautifulSoup libraries)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## A script to scrape all listings on this site: | |
## https://www.point2homes.com/US/Land-For-Sale/NH/Coos-County.html | |
## | |
## into the following csv format: | |
## | |
## Name,Address,Amount,Acres,Type,Misc | |
## | |
## e.g. | |
## Name,Address,Amount,Acres,Type,Misc | |
## "L52 Cloutier, Stark, NH","Stark, NH","$27,500","5.16","5 days on Point2 Homes" | |
## | |
## Since some of the data entries contain commas, they all are enclosed in
## double quotes.
## | |
## The last field is assumed to contain info about since when the listing is | |
## online and on which site. The first five fields are self-explanatory. | |
import requests | |
from bs4 import BeautifulSoup | |
############
## Constants
## Root of the site; next_page() joins relative pager hrefs onto this.
BASE_URL = "https://www.point2homes.com"
## Entry point: first page of the Coos County, NH land-for-sale listings.
FIRST_PAGE = "https://www.point2homes.com/US/Land-For-Sale/NH/Coos-County.html"
## Present a desktop-browser User-Agent (many listing sites block the
## default python-requests UA). Implicit string concatenation replaces the
## original `+\` join, which left an accidental double space in the UA.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/61.0.3163.100 Safari/537.36'}
################### | |
## Data Definitions | |
## Listing is Tuple of 6 fields, all are Strings. | |
## interpretation: a land for sale, with a: | |
## - name, | |
## - address, | |
## - amount (price), | |
## - acres, | |
## - type, and | |
## - information about since when the listing is online and on which site. | |
############ | |
## Functions | |
def listings(start_url):
    """String -> (Listof Listing)

    Given the URL of the listings page you want to start from, return
    Listings from that page and all subsequent pages.

    Iterative rather than recursive: the original recursion consumed one
    Python stack frame per result page and would hit the interpreter's
    recursion limit on a large result set.
    """
    accum = []
    page = BeautifulSoup(requests.get(start_url, headers=HEADERS).text,
                         'html.parser')
    while True:
        accum += scrape(page)
        next_url = next_page(BASE_URL, page)
        if not next_url:
            ## No "next" link on the pager: this was the last page.
            return accum
        page = BeautifulSoup(requests.get(next_url, headers=HEADERS).text,
                             'html.parser')
def scrape(page):
    """bs4.BeautifulSoup -> (Listof Listing)

    Given a page with land sale listings, return all listings from it.
    Each listing sits in its own <div class="item-cnt"> container.
    """
    containers = page.find_all('div', class_='item-cnt')
    return list(map(class_item_cnt2listing, containers))
def class_item_cnt2listing(cic):
    """bs4.element.Tag -> Listing

    Given a Tag with the class 'item_cnt', extract contents relevant to us.
    """
    def first(selector):
        ## First element matching the CSS selector; raises IndexError when
        ## the expected element is missing, same as the inline [0] form.
        return cic.select(selector)[0]

    name = first('div.address-container').get('data-address')
    ## The address is everything after the first comma of the full name.
    address = ','.join(name.split(',')[1:]).lstrip()
    amount = first('div.price').get('data-price').replace(' USD', '')
    acres = first('li.ic-lotsize').text.strip().replace(' ac Lot Size', '')
    kind = first('li.property-type').text.strip()
    days_on = first('div.days-on').text.strip()
    return name, address, amount, acres, kind, days_on
def next_page(base_url, cur_page):
    """bs4.BeautifulSoup -> String or None

    Given the base url and a (BeautifulSoup representation of a) listings
    page, return the URL of the next listings page, or None when the pager
    carries no "next" link (we are on the last page).
    """
    link = cur_page.select_one('div.pager > ul > li > a.pager-next')
    if link is None:
        return None
    return base_url + link.get('href')
def test_next_page():
    """Live-network test: the first page's pager should link to page 2."""
    first = BeautifulSoup(requests.get(FIRST_PAGE, headers=HEADERS).text,
                          'html.parser')
    got = next_page(BASE_URL, first)
    assert got == \
        "https://www.point2homes.com/US/Land-For-Sale/NH/Coos-County.html?page=2"
if __name__ == "__main__": | |
res = listings(FIRST_PAGE) | |
print("Name,Address,Amount,Acres,Type,Misc") | |
for l in res: | |
print('"{0}","{1}","{2}","{3}","{4}","{5}"'.format(*l)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment