Skip to content

Instantly share code, notes, and snippets.

@IlnarSelimcan
Created February 7, 2020 06:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save IlnarSelimcan/32aab1de5228649db45efc45299ba7f3 to your computer and use it in GitHub Desktop.
Save IlnarSelimcan/32aab1de5228649db45efc45299ba7f3 to your computer and use it in GitHub Desktop.
An example of me scraping a website using Python3 (with Requests & BeautifulSoup libraries)
## A script to scrape all listings on this site:
## https://www.point2homes.com/US/Land-For-Sale/NH/Coos-County.html
##
## into the following csv format:
##
## Name,Address,Amount,Acres,Type,Misc
##
## e.g.
## Name,Address,Amount,Acres,Type,Misc
## "L52 Cloutier, Stark, NH","Stark, NH","$27,500","5.16","Lot/Land","5 days on Point2 Homes"
##
## Since some of the data entries contain commas, they all are enclosed in double
## quotes.
##
## The last field is assumed to contain info about since when the listing is
## online and on which site. The first five fields are self-explanatory.
import requests
from bs4 import BeautifulSoup
############
## Constants

# Root of the site; relative "next page" links get appended to this.
BASE_URL = "https://www.point2homes.com"

# Entry point: first page of the Coos County land listings.
FIRST_PAGE = "https://www.point2homes.com/US/Land-For-Sale/NH/Coos-County.html"

# Browser-like User-Agent so the site serves the normal HTML pages.
HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 '
                   ' (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'),
}
###################
## Data Definitions
## Listing is Tuple of 6 fields, all are Strings.
## interpretation: a land for sale, with a:
## - name,
## - address,
## - amount (price),
## - acres,
## - type, and
## - information about since when the listing is online and on which site.
############
## Functions
def listings(start_url):
    """String -> (Listof Listing)

    Given the URL of the listings page you want to start from, return
    Listings from that page and all subsequent pages.

    NOTE(review): performs one live HTTP GET per results page.
    """
    # Iterate rather than recurse: Python has no tail-call optimization,
    # so the original one-recursive-call-per-page scheme could exhaust the
    # recursion limit on a site with many result pages. Accumulation order
    # is unchanged: current page's listings, then the following pages'.
    accum = []
    page = BeautifulSoup(requests.get(start_url, headers=HEADERS).text,
                         'html.parser')
    while page is not None:
        accum.extend(scrape(page))
        next_url = next_page(BASE_URL, page)
        if next_url is None:
            page = None  # no "next" link -> this was the last page
        else:
            page = BeautifulSoup(requests.get(next_url, headers=HEADERS).text,
                                 'html.parser')
    return accum
def scrape(page):
    """ bs4.BeautifulSoup -> (Listof Listing)

    Given a page with land sale listings, return all listings from it.
    """
    # Each listing lives in a <div class="item-cnt"> container.
    results = []
    for container in page.find_all('div', class_='item-cnt'):
        results.append(class_item_cnt2listing(container))
    return results
def class_item_cnt2listing(cic):
    """ bs4.element.Tag -> Listing

    Given a Tag with the class 'item_cnt', extract contents relevant to us.
    """
    # The data-address attribute carries the full listing name; the street
    # address proper is everything after the first comma.
    full_name = cic.select('div.address-container')[0].get('data-address')
    addr = ','.join(full_name.split(',')[1:]).lstrip()

    # Strip the units/suffixes the site embeds in its attribute/text values.
    price_tag = cic.select('div.price')[0]
    price = price_tag.get('data-price').replace(' USD', '')
    lot_tag = cic.select('li.ic-lotsize')[0]
    lot_size = lot_tag.text.strip().replace(' ac Lot Size', '')

    prop_type = cic.select('li.property-type')[0].text.strip()
    listed_since = cic.select('div.days-on')[0].text.strip()

    return (full_name, addr, price, lot_size, prop_type, listed_since)
def next_page(base_url, cur_page):
    """bs4.BeautifulSoup -> String or None

    Given the base url and a (BeautifulSoup representation of a) listings
    page, return the URL of the next listings page, or None when the page
    has no "next" link (i.e. it is the last page).
    """
    links = cur_page.select('div.pager > ul > li > a.pager-next')
    if not links:
        return None
    # The href is site-relative, so prefix it with the base url.
    return base_url + links[0].get('href')
def test_next_page():
    """Smoke test: the first listings page should link to page 2.

    NOTE(review): performs a live HTTP request against the site.
    """
    expected = ("https://www.point2homes.com"
                "/US/Land-For-Sale/NH/Coos-County.html?page=2")
    first = BeautifulSoup(requests.get(FIRST_PAGE, headers=HEADERS).text,
                          'html.parser')
    assert next_page(BASE_URL, first) == expected
if __name__ == "__main__":
    # Fetch everything first, then emit the CSV (header, then one quoted
    # row per listing). Fields are double-quoted because some contain commas.
    all_listings = listings(FIRST_PAGE)
    print("Name,Address,Amount,Acres,Type,Misc")
    for row in all_listings:
        print('"{0}","{1}","{2}","{3}","{4}","{5}"'.format(*row))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment