Skip to content

Instantly share code, notes, and snippets.

@moloch--
Last active November 17, 2021 22:00
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save moloch--/ce04f5623ec3161bb1fd to your computer and use it in GitHub Desktop.
Save moloch--/ce04f5623ec3161bb1fd to your computer and use it in GitHub Desktop.
Craigslist Bot/Parser
#!/usr/bin/env python
######################################
#
# Author: Moloch
#
# Required libs:
# pip install requests
# pip install beautifulsoup4
# pip install PyRSS2Gen
# pip install python-dateutil
######################################
import sys
import logging
import requests
import argparse
import platform
import PyRSS2Gen
import xml.dom.minidom
from urllib import urlencode
from urlparse import urljoin
from datetime import datetime
from bs4 import BeautifulSoup
from dateutil import parser as dateparser
# Base URL for Craigslist apartment ("apa") searches in the SF bay area
SEARCH_URL = 'https://sfbay.craigslist.org/search/apa/'

# Terminal prefixes: use ANSI color escapes only on platforms whose
# default terminals understand them (Linux / macOS), plain tags otherwise.
if platform.system().lower() in ['linux', 'darwin']:
    INFO, WARN, BOLD = (
        "\033[1m\033[36m[*]\033[0m ",
        "\033[1m\033[31m[!]\033[0m ",
        "\033[1m",
    )
else:
    INFO, WARN, BOLD = "[*] ", "[!] ", ""
def print_info(msg):
    ''' Erase the current terminal line, then print msg in its place '''
    # ESC[2K clears the whole line; '\r' returns the cursor to column 0
    clear_line = chr(27) + '[2K'
    sys.stdout.write(clear_line)
    sys.stdout.write('\r' + INFO + msg)
    sys.stdout.flush()
class Advertisement(object):
    ''' One Craigslist listing, identified by its data-pid.

    Construction fetches the full posting page over HTTP; the parsed
    details (title, post date, images, description) are exposed as
    read-only properties on the downloaded page's soup.
    '''

    headers = {
        'User-Agent': "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"
    }
    _soup = None  # BeautifulSoup of the full posting page

    def __init__(self, area_code, p):
        self.area_code = area_code
        self.pid = p.attrs['data-pid']
        self.geo = p    # setter extracts lat/long from the search-result tag
        self.price = p  # setter extracts the asking price from the tag
        self._make_soup()

    @property
    def title(self):
        ''' Title of the posting page '''
        return self._soup.title.text.strip()

    @property
    def created(self):
        ''' Datetime the ad was posted (parsed from the <time> tag) '''
        stamp = self._soup.find('time').attrs['datetime']
        return dateparser.parse(stamp)

    @property
    def geo(self):
        ''' Returns a tuple of lat, long '''
        return (self.latitude, self.longitude,)

    @geo.setter
    def geo(self, tag):
        # Listings without coordinates default to (0.0, 0.0)
        self.latitude = float(tag.attrs.get('data-latitude', 0))
        self.longitude = float(tag.attrs.get('data-longitude', 0))

    @property
    def price(self):
        ''' Monthly asking price in dollars; 0 when none was listed '''
        return self._price

    @price.setter
    def price(self, tag):
        price_span = tag.find('span', attrs={"class": "price"})
        if price_span is None:
            self._price = 0
        else:
            self._price = int(price_span.text.replace('$', ''))

    @property
    def href(self):
        ''' Canonical URL of the full posting '''
        return "https://sfbay.craigslist.org/%s/apa/%s.html" % (self.area_code, self.pid)

    @property
    def images(self):
        ''' Returns URLs for related images '''
        thumbs = self._soup.find('div', attrs={'id': 'thumbs'})
        if not thumbs:
            return []
        return [a.attrs['href'] for a in thumbs.find_all('a', attrs={'href': True})]

    @property
    def description(self):
        ''' Full text of the posting body '''
        body = self._soup.find('section', attrs={'id': 'postingbody'})
        parts = []
        for child in body.children:
            if hasattr(child, 'text'):
                parts.append(child.text)
        return ''.join(parts)

    def _make_soup(self):
        ''' Download the posting page and parse it '''
        response = requests.get(self.href, headers=self.headers)
        self._soup = BeautifulSoup(response.text, "html5lib")

    def __cmp__(self, other):
        # Py2 ordering hook: order ads by asking price
        if self.price == other.price:
            return 0
        return 1 if other.price < self.price else -1

    def __eq__(self, other):
        # Two ads are the same listing iff they share a pid
        return self.pid == other.pid

    def __ne__(self, other):
        return not self == other

    def __str__(self):
        return self.title

    def __repr__(self):
        return '<Advertisement price: $%s, geo: %s, href: %s>' % (
            self.price, self.geo, self.href
        )
class Craigslist(object):
    ''' A Craigslist apartment search.

    Builds the search query URL from the given filters, fetches every
    page of results (100 per page), and iterates over the matching
    listings as Advertisement objects.
    '''

    headers = {
        'User-Agent': "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"
    }

    # Craigslist's short codes for each supported search area
    area_codes = {
        'sf': 'sfc',
        'east-bay': 'eby',
    }

    # Craigslist's numeric neighborhood filter codes, keyed by area
    neigborhood_codes = {
        'sf': {
            'bayview': 2,
            'castro': 4,
            'upper market': 4,
            'mission district': 18,
            'nob hill': 19,
            'potrero hill': 25,
            'russian hill': 27,
        },
        'east-bay': {
            'berkeley': 48,
            'north berkeley': 49,
            'berkeley hills': 49,
        }
    }

    def __init__(self, area, min_rent='', max_rent='', beds='', neighborhoods=None, cats=False, dogs=False):
        ''' Build the query URL and fetch all result pages.

        :param area: key into area_codes ('sf' or 'east-bay')
        :param min_rent: minimum monthly rent filter
        :param max_rent: maximum monthly rent filter
        :param beds: minimum number of bedrooms
        :param neighborhoods: iterable of neighborhood names (keys of
            neigborhood_codes[area]); defaults to no neighborhood filter
        :param cats: only listings that allow cats
        :param dogs: only listings that allow dogs
        :raises NotImplementedError: if area is not a known area code
        '''
        # BUGFIX: the default was a shared mutable list ([]); use None as
        # the sentinel and substitute a fresh list per call instead.
        if neighborhoods is None:
            neighborhoods = []
        params = {
            'minAsk': min_rent,
            'maxAsk': max_rent,
            'bedrooms': beds,
        }
        if area not in self.area_codes:
            raise NotImplementedError("That area is not implemented yet")
        self.area_code = self.area_codes[area]
        self.query = SEARCH_URL + self.area_code + '?' + urlencode(params)
        for hood in neighborhoods:
            self.query += '&nh=%d' % self.neigborhood_codes[area][hood]
        if cats:
            self.query += '&addTwo=purrr'  # Craigslist's "cats ok" flag
        if dogs:
            self.query += '&addThree=wooof'  # Craigslist's "dogs ok" flag
        self._make_soup(self.query)
        self._pages()

    @classmethod
    def neighborhoods(cls, area):
        ''' List the known neighborhood names for an area.

        :raises NotImplementedError: if area is not a known area code
        '''
        if area not in cls.neigborhood_codes:
            raise NotImplementedError("That area is not implemented yet")
        return cls.neigborhood_codes[area].keys()

    def _make_soup(self, url):
        ''' GET url and parse the response body '''
        self._response = requests.get(url, headers=self.headers)
        # Name the parser explicitly — the same one Advertisement uses —
        # instead of letting BeautifulSoup guess (which emits a warning
        # and can vary by what parsers happen to be installed).
        self._soup = BeautifulSoup(self._response.text, "html5lib")

    def _pages(self):
        ''' Collect the result <p> tags from every page of the search
        (Craigslist paginates 100 results per page via the s= offset). '''
        self._paragraphs = self._soup.find_all('p', attrs={'data-pid': True})
        if 100 < len(self):
            for index in range(100, len(self), 100):
                self._make_soup(self.query + '&s=%d' % index)
                self._paragraphs += self._soup.find_all(
                    'p', attrs={'data-pid': True})

    def __iter__(self):
        ''' Yield an Advertisement for each result paragraph '''
        for p in self._paragraphs:
            yield Advertisement(self.area_code, p)

    def __len__(self):
        ''' Total result count reported by the search page (0 if absent) '''
        count = self._soup.find('span', attrs={'class': 'resultcount'})
        return int(count.text) if count else 0
def create_rss(craigslist, title, link, description):
    ''' Build and return a PyRSS2Gen.RSS2 feed for a Craigslist search.

    Fetches every ad (one HTTP request each, done by the Advertisement
    constructor) and reports progress on stdout along the way.
    '''
    total = len(craigslist)
    items = []
    for index, ad in enumerate(craigslist):
        print_info("Retrieving RSS item %d of %d" % (index + 1, total))
        items.append(PyRSS2Gen.RSSItem(
            title=ad.title,
            link=ad.href,
            description=ad.description,
            guid=PyRSS2Gen.Guid(ad.href),
            pubDate=ad.created,
        ))
    print_info("Successfully retrieved all items\n")
    return PyRSS2Gen.RSS2(
        title=title,
        link=link,
        description=description,
        lastBuildDate=datetime.now(),
        items=items,
    )
if __name__ == '__main__':

    def _cli(args):
        ''' Run a search from the parsed command line arguments '''
        try:
            craig = Craigslist(
                area=args.area,
                min_rent=args.min,
                max_rent=args.max,
                beds=args.beds,
                cats=args.cats,
                dogs=args.dogs,
                neighborhoods=Craigslist.neighborhoods(args.area),
            )
            print_info('Found %d results ...\n' % len(craig))
            if args.rss:
                # Write the results out as pretty-printed RSS XML
                rss = create_rss(craig, "Craigslist2Rss", "", "")
                print_info("Writing data to file ...")
                with open(args.rss, "w") as fp:
                    doc = xml.dom.minidom.parseString(rss.to_xml())
                    rss_xml = doc.toprettyxml().encode('utf-8')
                    fp.write(rss_xml)
                    print_info("Wrote %d byte(s) to %s\n" % (len(rss_xml), args.rss))
            else:
                # Dump the results to the terminal
                for index, ad in enumerate(craig):
                    print("%d) $%d - %s" % (index + 1, ad.price, unicode(ad)))
                    if args.links:
                        print("\t%s" % ad.href)
                    if args.images:
                        for image in ad.images:
                            print("\t%s" % image)
        except KeyboardInterrupt:
            print(chr(27) + '[2K\r' + WARN + 'Stopping ...')
        except NotImplementedError as error:
            print(chr(27) + '[2K\r' + WARN + str(error))

    parser = argparse.ArgumentParser(description='Craigslist parser/scraper')
    parser.add_argument('--version', action='version',
                        version='%(prog)s v0.0.1')
    parser.add_argument('--verbose', '-v', dest='verbose', action='store_true',
                        help='display verbose output (default: false)')
    parser.add_argument('--area', '-a', dest='area', required=True,
                        help='specify a search area %s' % Craigslist.area_codes.keys())
    parser.add_argument('--beds', '-b', dest='beds', type=int,
                        help='min number of beds')
    parser.add_argument('--min-rent', '-mn', dest='min', type=int, default=0,
                        help='min monthly rent')
    parser.add_argument('--max-rent', '-mx', dest='max', type=int, default=100000,
                        help='max monthly rent')
    parser.add_argument('--dogs', '-d', dest='dogs', action='store_true',
                        help='allows dogs')
    parser.add_argument('--cats', '-c', dest='cats', action='store_true',
                        help='allows cats')
    parser.add_argument('--images', '-i', dest='images', action='store_true',
                        help='display image links')
    parser.add_argument('--links', '-l', dest='links', action='store_true',
                        help='display links to ad')
    parser.add_argument('--rss', '-r', dest='rss',
                        help='output results to an rss formatted xml file')
    args = parser.parse_args()
    if args.verbose:
        logger = logging.getLogger()
        logging.basicConfig(
            format='[%(levelname)s] %(asctime)s - %(message)s',
            level=logging.DEBUG)
    _cli(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment