@Carpetfizz
Last active December 29, 2023 00:42
Craigslist scraper for finding new apartments

Craigslist housing scraper

Still very much a WIP but it gets the job done.

Usage

  1. pip install -r requirements.txt
  2. Replace WATCH_URL and BASE_URL in scraper.py (see the example values below)
  3. python3 scraper.py output_file

You may run into errors installing the lxml dependency; refer to its installation guide for troubleshooting.
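
As a reference for step 2, the two constants near the top of scraper.py look like the example below. The distance, postal code, price cap, and bedroom count are only the values currently in the script; substitute your own query parameters.

BASE_URL = "http://sfbay.craigslist.org"
# Any Craigslist apartment search URL works here; the filter values are examples.
WATCH_URL = BASE_URL + "/search/apa?search_distance=1&postal=94720&max_price=3300&bedrooms=2"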

// TODO:

  • Send the WATCH_URL as a command line arg

  • Parse BASE_URL from the WATCH_URL

  • Compare the data-repost-of attribute to data-pid to filter duplicates (a rough sketch of these last two items follows this list)
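
A rough sketch of how the last two items might look; the helper names and the urllib.parse usage are assumptions, not code that exists in scraper.py yet:

from urllib.parse import urlsplit

def base_url_from(watch_url):
    # Derive BASE_URL (e.g. "http://sfbay.craigslist.org") from the search URL.
    parts = urlsplit(watch_url)
    return parts.scheme + "://" + parts.netloc

def filter_reposts(rows):
    # Drop rows whose data-repost-of points at a data-pid that is already in
    # the same result set, keeping only the original posting.
    pids = {row.get("data-pid") for row in rows}
    return [row for row in rows if row.get("data-repost-of") not in pids]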

Sample output

TWO BEDROOM APARTMENT IN BERKELEY!
http://sfbay.craigslist.org/eby/apa/5502409578.html
Mon 21 Mar 08:14:23 PM
$3195
/ 2br - 1000ft
Great Remodeled Apt. in top location; few blocks to UCB
http://sfbay.craigslist.org/eby/apa/5458020372.html
Mon 21 Mar 04:53:00 PM
$3300
/ 2br -
Walk to UC Berkeley from this lovely 2 bdr apt. available June 3
http://sfbay.craigslist.org/eby/apa/5502053675.html
Mon 21 Mar 03:30:12 PM
$2400
/ 2br -
Lovely, sunny 3 bdrm, 2 bath apt. near UC Berkeley available June 3
http://sfbay.craigslist.org/eby/apa/5502035413.html
Mon 21 Mar 03:18:30 PM
$3200
/ 3br -
Southside 2 bed/2bath Unit with On-Site Manager and POOL
http://sfbay.craigslist.org/eby/apa/5501969486.html
Mon 21 Mar 03:00:16 PM
$2990
/ 2br -
Elegant 2-story cottage minutes from Cal and BART
http://sfbay.craigslist.org/eby/apa/5479135846.html
Mon 21 Mar 02:58:38 PM
$2995
/ 2br -
... about 100 more of these ...
# scraper.py
from sys import argv

import requests
from lxml import html

# Craigslist search URL to watch and the site root used to build absolute links.
WATCH_URL = "http://sfbay.craigslist.org/search/apa?search_distance=1&postal=94720&max_price=3300&bedrooms=2?format=rss"
BASE_URL = "http://sfbay.craigslist.org"


class Listing():

    def __init__(self, _id, date, title, price, housing, link):
        self._id = _id
        self.link = link
        self.date = date
        self.title = title
        self.price = price
        self.housing = housing

    def __repr__(self):
        return ("Listing(" + self._id + ", " + self.date + ", " + self.title + ", "
                + self.price + ", " + self.housing + ", " + self.link + ")")


def get_listings(watch_url, base_url):
    # Fetch the search results page and parse each row into a Listing.
    page = requests.get(watch_url)
    tree = html.fromstring(page.content)
    listing_list = tree.xpath('//*[@id="searchform"]/div[4]')[0]
    # Skip the first and last children of the results container, which are not rows.
    listings_raw = listing_list[1:len(listing_list) - 1]
    listings = []
    for listing in listings_raw:
        # The nested indexes follow the markup of a Craigslist results row.
        _id = listing.get("data-pid")
        date = listing[1][1][0].get("title")
        title = listing[1][1][1][0].text
        price = listing[1][2][0].text
        housing = listing[1][2][1].text
        link = base_url + listing[0].get("href")
        listings.append(Listing(_id, date, title, price, housing, link))
    return listings


def write_listings(of):
    # Append each listing as a small plain-text block to the output file.
    all_listings = get_listings(WATCH_URL, BASE_URL)
    for listing in all_listings:
        of.write(listing.title + '\n')
        of.write(listing.link + '\n')
        of.write(listing.date + '\n')
        of.write(listing.price + '\n')
        of.write(listing.housing + '\n')
        of.write('\n')


try:
    output_file = argv[1]
    if output_file:
        with open(output_file, 'a') as of:
            write_listings(of)
except IndexError:
    print("Missing argument: output file name")

# Also echo the parsed listings to stdout.
print(get_listings(WATCH_URL, BASE_URL))