Skip to content

Instantly share code, notes, and snippets.

@IlnarSelimcan
Created February 7, 2020 06:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save IlnarSelimcan/32aab1de5228649db45efc45299ba7f3 to your computer and use it in GitHub Desktop.
Save IlnarSelimcan/32aab1de5228649db45efc45299ba7f3 to your computer and use it in GitHub Desktop.
An example of me scraping a website using Python3 (with Requests & BeautifulSoup libraries)
## A script to scrape all listings on this site:
## https://www.point2homes.com/US/Land-For-Sale/NH/Coos-County.html
##
## into the following csv format:
##
## Name,Address,Amount,Acres,Type,Misc
##
## e.g.
## Name,Address,Amount,Acres,Type,Misc
## "L52 Cloutier, Stark, NH","Stark, NH","$27,500","5.16","Lot/Land","5 days on Point2 Homes"
##
## Since some of the data entries contain commas, they all are enclosed in double
## quotes.
##
## The last field is assumed to contain info about since when the listing is
## online and on which site. The first five fields are self-explanatory.
import requests
from bs4 import BeautifulSoup
############
## Constants

# Root of the site; relative "next page" links get appended to this.
BASE_URL = "https://www.point2homes.com"

# Entry point: first page of the Coos County land listings.
FIRST_PAGE = "https://www.point2homes.com/US/Land-For-Sale/NH/Coos-County.html"

# Browser-like User-Agent so the site serves the normal HTML pages.
HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 '
                   ' (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'),
}
###################
## Data Definitions
## Listing is Tuple of 6 fields, all are Strings.
## interpretation: a land for sale, with a:
## - name,
## - address,
## - amount (price),
## - acres,
## - type, and
## - information about since when the listing is online and on which site.
############
## Functions
def listings(start_url):
    """String -> (Listof Listing)

    Given the URL of the listings page you want to start from, return
    Listings from that page and all subsequent pages.

    NOTE(review): performs one live HTTP GET per results page.
    """
    # Iterate rather than recurse: Python has no tail-call optimization,
    # so the original one-recursive-call-per-page scheme could exhaust the
    # recursion limit on a site with many result pages. Accumulation order
    # is unchanged: current page's listings, then the following pages'.
    accum = []
    page = BeautifulSoup(requests.get(start_url, headers=HEADERS).text,
                         'html.parser')
    while page is not None:
        accum.extend(scrape(page))
        next_url = next_page(BASE_URL, page)
        if next_url is None:
            page = None  # no "next" link -> this was the last page
        else:
            page = BeautifulSoup(requests.get(next_url, headers=HEADERS).text,
                                 'html.parser')
    return accum
def scrape(page):
    """ bs4.BeautifulSoup -> (Listof Listing)

    Given a page with land sale listings, return all listings from it.
    """
    # Each listing lives in a <div class="item-cnt"> container.
    results = []
    for container in page.find_all('div', class_='item-cnt'):
        results.append(class_item_cnt2listing(container))
    return results
def class_item_cnt2listing(cic):
    """ bs4.element.Tag -> Listing

    Given a Tag with the class 'item_cnt', extract contents relevant to us.
    """
    # The data-address attribute carries the full listing name; the street
    # address proper is everything after the first comma.
    full_name = cic.select('div.address-container')[0].get('data-address')
    addr = ','.join(full_name.split(',')[1:]).lstrip()

    # Strip the units/suffixes the site embeds in its attribute/text values.
    price_tag = cic.select('div.price')[0]
    price = price_tag.get('data-price').replace(' USD', '')
    lot_tag = cic.select('li.ic-lotsize')[0]
    lot_size = lot_tag.text.strip().replace(' ac Lot Size', '')

    prop_type = cic.select('li.property-type')[0].text.strip()
    listed_since = cic.select('div.days-on')[0].text.strip()

    return (full_name, addr, price, lot_size, prop_type, listed_since)
def next_page(base_url, cur_page):
    """bs4.BeautifulSoup -> String or None

    Given the base url and a (BeautifulSoup representation of a) listings
    page, return the URL of the next listings page, or None when the page
    has no "next" link (i.e. it is the last page).
    """
    links = cur_page.select('div.pager > ul > li > a.pager-next')
    if not links:
        return None
    # The href is site-relative, so prefix it with the base url.
    return base_url + links[0].get('href')
def test_next_page():
    """Smoke test: the first listings page should link to page 2.

    NOTE(review): performs a live HTTP request against the site.
    """
    expected = ("https://www.point2homes.com"
                "/US/Land-For-Sale/NH/Coos-County.html?page=2")
    first = BeautifulSoup(requests.get(FIRST_PAGE, headers=HEADERS).text,
                          'html.parser')
    assert next_page(BASE_URL, first) == expected
if __name__ == "__main__":
    # Fetch everything first, then emit the CSV (header, then one quoted
    # row per listing). Fields are double-quoted because some contain commas.
    all_listings = listings(FIRST_PAGE)
    print("Name,Address,Amount,Acres,Type,Misc")
    for row in all_listings:
        print('"{0}","{1}","{2}","{3}","{4}","{5}"'.format(*row))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment