Skip to content

Instantly share code, notes, and snippets.

@moloch--
Last active November 17, 2021 22:00
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save moloch--/ce04f5623ec3161bb1fd to your computer and use it in GitHub Desktop.
Save moloch--/ce04f5623ec3161bb1fd to your computer and use it in GitHub Desktop.
Craigslist Bot/Parser
#!/usr/bin/env python
######################################
#
# Author: Moloch
#
# Required libs:
# pip install requests
# pip install beautifulsoup4
# pip install PyRSS2Gen
# pip install python-dateutil
######################################
import sys
import logging
import requests
import argparse
import platform
import PyRSS2Gen
import xml.dom.minidom
from urllib import urlencode
from urlparse import urljoin
from datetime import datetime
from bs4 import BeautifulSoup
from dateutil import parser as dateparser
# Base URL for Craigslist apartment ("apa") searches in the SF bay area
SEARCH_URL = 'https://sfbay.craigslist.org/search/apa/'

# Terminal prefixes: use ANSI color escapes only on platforms whose
# default terminals understand them (Linux / macOS), plain tags otherwise.
if platform.system().lower() in ['linux', 'darwin']:
    INFO, WARN, BOLD = (
        "\033[1m\033[36m[*]\033[0m ",
        "\033[1m\033[31m[!]\033[0m ",
        "\033[1m",
    )
else:
    INFO, WARN, BOLD = "[*] ", "[!] ", ""
def print_info(msg):
    ''' Erase the current terminal line, then print msg in its place '''
    # ESC[2K clears the whole line; '\r' returns the cursor to column 0
    clear_line = chr(27) + '[2K'
    sys.stdout.write(clear_line)
    sys.stdout.write('\r' + INFO + msg)
    sys.stdout.flush()
class Advertisement(object):
    ''' One Craigslist listing, identified by its data-pid.

    Construction fetches the full posting page over HTTP; the parsed
    details (title, post date, images, description) are exposed as
    read-only properties on the downloaded page's soup.
    '''

    headers = {
        'User-Agent': "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"
    }
    _soup = None  # BeautifulSoup of the full posting page

    def __init__(self, area_code, p):
        self.area_code = area_code
        self.pid = p.attrs['data-pid']
        self.geo = p    # setter extracts lat/long from the search-result tag
        self.price = p  # setter extracts the asking price from the tag
        self._make_soup()

    @property
    def title(self):
        ''' Title of the posting page '''
        return self._soup.title.text.strip()

    @property
    def created(self):
        ''' Datetime the ad was posted (parsed from the <time> tag) '''
        stamp = self._soup.find('time').attrs['datetime']
        return dateparser.parse(stamp)

    @property
    def geo(self):
        ''' Returns a tuple of lat, long '''
        return (self.latitude, self.longitude,)

    @geo.setter
    def geo(self, tag):
        # Listings without coordinates default to (0.0, 0.0)
        self.latitude = float(tag.attrs.get('data-latitude', 0))
        self.longitude = float(tag.attrs.get('data-longitude', 0))

    @property
    def price(self):
        ''' Monthly asking price in dollars; 0 when none was listed '''
        return self._price

    @price.setter
    def price(self, tag):
        price_span = tag.find('span', attrs={"class": "price"})
        if price_span is None:
            self._price = 0
        else:
            self._price = int(price_span.text.replace('$', ''))

    @property
    def href(self):
        ''' Canonical URL of the full posting '''
        return "https://sfbay.craigslist.org/%s/apa/%s.html" % (self.area_code, self.pid)

    @property
    def images(self):
        ''' Returns URLs for related images '''
        thumbs = self._soup.find('div', attrs={'id': 'thumbs'})
        if not thumbs:
            return []
        return [a.attrs['href'] for a in thumbs.find_all('a', attrs={'href': True})]

    @property
    def description(self):
        ''' Full text of the posting body '''
        body = self._soup.find('section', attrs={'id': 'postingbody'})
        parts = []
        for child in body.children:
            if hasattr(child, 'text'):
                parts.append(child.text)
        return ''.join(parts)

    def _make_soup(self):
        ''' Download the posting page and parse it '''
        response = requests.get(self.href, headers=self.headers)
        self._soup = BeautifulSoup(response.text, "html5lib")

    def __cmp__(self, other):
        # Py2 ordering hook: order ads by asking price
        if self.price == other.price:
            return 0
        return 1 if other.price < self.price else -1

    def __eq__(self, other):
        # Two ads are the same listing iff they share a pid
        return self.pid == other.pid

    def __ne__(self, other):
        return not self == other

    def __str__(self):
        return self.title

    def __repr__(self):
        return '<Advertisement price: $%s, geo: %s, href: %s>' % (
            self.price, self.geo, self.href
        )
class Craigslist(object):
    ''' A Craigslist apartment search.

    Builds the search query URL from the given filters, fetches every
    page of results (100 per page), and iterates over the matching
    listings as Advertisement objects.
    '''

    headers = {
        'User-Agent': "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"
    }

    # Craigslist's short codes for each supported search area
    area_codes = {
        'sf': 'sfc',
        'east-bay': 'eby',
    }

    # Craigslist's numeric neighborhood filter codes, keyed by area
    neigborhood_codes = {
        'sf': {
            'bayview': 2,
            'castro': 4,
            'upper market': 4,
            'mission district': 18,
            'nob hill': 19,
            'potrero hill': 25,
            'russian hill': 27,
        },
        'east-bay': {
            'berkeley': 48,
            'north berkeley': 49,
            'berkeley hills': 49,
        }
    }

    def __init__(self, area, min_rent='', max_rent='', beds='', neighborhoods=None, cats=False, dogs=False):
        ''' Build the query URL and fetch all result pages.

        :param area: key into area_codes ('sf' or 'east-bay')
        :param min_rent: minimum monthly rent filter
        :param max_rent: maximum monthly rent filter
        :param beds: minimum number of bedrooms
        :param neighborhoods: iterable of neighborhood names (keys of
            neigborhood_codes[area]); defaults to no neighborhood filter
        :param cats: only listings that allow cats
        :param dogs: only listings that allow dogs
        :raises NotImplementedError: if area is not a known area code
        '''
        # BUGFIX: the default was a shared mutable list ([]); use None as
        # the sentinel and substitute a fresh list per call instead.
        if neighborhoods is None:
            neighborhoods = []
        params = {
            'minAsk': min_rent,
            'maxAsk': max_rent,
            'bedrooms': beds,
        }
        if area not in self.area_codes:
            raise NotImplementedError("That area is not implemented yet")
        self.area_code = self.area_codes[area]
        self.query = SEARCH_URL + self.area_code + '?' + urlencode(params)
        for hood in neighborhoods:
            self.query += '&nh=%d' % self.neigborhood_codes[area][hood]
        if cats:
            self.query += '&addTwo=purrr'  # Craigslist's "cats ok" flag
        if dogs:
            self.query += '&addThree=wooof'  # Craigslist's "dogs ok" flag
        self._make_soup(self.query)
        self._pages()

    @classmethod
    def neighborhoods(cls, area):
        ''' List the known neighborhood names for an area.

        :raises NotImplementedError: if area is not a known area code
        '''
        if area not in cls.neigborhood_codes:
            raise NotImplementedError("That area is not implemented yet")
        return cls.neigborhood_codes[area].keys()

    def _make_soup(self, url):
        ''' GET url and parse the response body '''
        self._response = requests.get(url, headers=self.headers)
        # Name the parser explicitly — the same one Advertisement uses —
        # instead of letting BeautifulSoup guess (which emits a warning
        # and can vary by what parsers happen to be installed).
        self._soup = BeautifulSoup(self._response.text, "html5lib")

    def _pages(self):
        ''' Collect the result <p> tags from every page of the search
        (Craigslist paginates 100 results per page via the s= offset). '''
        self._paragraphs = self._soup.find_all('p', attrs={'data-pid': True})
        if 100 < len(self):
            for index in range(100, len(self), 100):
                self._make_soup(self.query + '&s=%d' % index)
                self._paragraphs += self._soup.find_all(
                    'p', attrs={'data-pid': True})

    def __iter__(self):
        ''' Yield an Advertisement for each result paragraph '''
        for p in self._paragraphs:
            yield Advertisement(self.area_code, p)

    def __len__(self):
        ''' Total result count reported by the search page (0 if absent) '''
        count = self._soup.find('span', attrs={'class': 'resultcount'})
        return int(count.text) if count else 0
def create_rss(craigslist, title, link, description):
    ''' Build and return a PyRSS2Gen.RSS2 feed for a Craigslist search.

    Fetches every ad (one HTTP request each, done by the Advertisement
    constructor) and reports progress on stdout along the way.
    '''
    total = len(craigslist)
    items = []
    for index, ad in enumerate(craigslist):
        print_info("Retrieving RSS item %d of %d" % (index + 1, total))
        items.append(PyRSS2Gen.RSSItem(
            title=ad.title,
            link=ad.href,
            description=ad.description,
            guid=PyRSS2Gen.Guid(ad.href),
            pubDate=ad.created,
        ))
    print_info("Successfully retrieved all items\n")
    return PyRSS2Gen.RSS2(
        title=title,
        link=link,
        description=description,
        lastBuildDate=datetime.now(),
        items=items,
    )
if __name__ == '__main__':

    def _cli(args):
        ''' Run a search from the parsed command line arguments '''
        try:
            craig = Craigslist(
                area=args.area,
                min_rent=args.min,
                max_rent=args.max,
                beds=args.beds,
                cats=args.cats,
                dogs=args.dogs,
                neighborhoods=Craigslist.neighborhoods(args.area),
            )
            print_info('Found %d results ...\n' % len(craig))
            if args.rss:
                # Write the results out as pretty-printed RSS XML
                rss = create_rss(craig, "Craigslist2Rss", "", "")
                print_info("Writing data to file ...")
                with open(args.rss, "w") as fp:
                    doc = xml.dom.minidom.parseString(rss.to_xml())
                    rss_xml = doc.toprettyxml().encode('utf-8')
                    fp.write(rss_xml)
                    print_info("Wrote %d byte(s) to %s\n" % (len(rss_xml), args.rss))
            else:
                # Dump the results to the terminal
                for index, ad in enumerate(craig):
                    print("%d) $%d - %s" % (index + 1, ad.price, unicode(ad)))
                    if args.links:
                        print("\t%s" % ad.href)
                    if args.images:
                        for image in ad.images:
                            print("\t%s" % image)
        except KeyboardInterrupt:
            print(chr(27) + '[2K\r' + WARN + 'Stopping ...')
        except NotImplementedError as error:
            print(chr(27) + '[2K\r' + WARN + str(error))

    parser = argparse.ArgumentParser(description='Craigslist parser/scraper')
    parser.add_argument('--version', action='version',
                        version='%(prog)s v0.0.1')
    parser.add_argument('--verbose', '-v', dest='verbose', action='store_true',
                        help='display verbose output (default: false)')
    parser.add_argument('--area', '-a', dest='area', required=True,
                        help='specify a search area %s' % Craigslist.area_codes.keys())
    parser.add_argument('--beds', '-b', dest='beds', type=int,
                        help='min number of beds')
    parser.add_argument('--min-rent', '-mn', dest='min', type=int, default=0,
                        help='min monthly rent')
    parser.add_argument('--max-rent', '-mx', dest='max', type=int, default=100000,
                        help='max monthly rent')
    parser.add_argument('--dogs', '-d', dest='dogs', action='store_true',
                        help='allows dogs')
    parser.add_argument('--cats', '-c', dest='cats', action='store_true',
                        help='allows cats')
    parser.add_argument('--images', '-i', dest='images', action='store_true',
                        help='display image links')
    parser.add_argument('--links', '-l', dest='links', action='store_true',
                        help='display links to ad')
    parser.add_argument('--rss', '-r', dest='rss',
                        help='output results to an rss formatted xml file')
    args = parser.parse_args()
    if args.verbose:
        logger = logging.getLogger()
        logging.basicConfig(
            format='[%(levelname)s] %(asctime)s - %(message)s',
            level=logging.DEBUG)
    _cli(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment