starenka/hbazar.py

## hbazar.py
#!/usr/bin/env python
# coding=utf-8

# pip install requests beautifulsoup4
# TODO:   handle category selection (broken server-side)
#         fix occasional incorrect price detection

from __future__ import division
import re
import datetime
import argparse
import itertools
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

import requests
from bs4 import BeautifulSoup

# Number of ads requested per result page (sent as the ``i`` query parameter).
PER_PAGE = 50
URL_BASE = 'http://hudebnibazar.cz'
# Query-string template: ``page``, ``term`` and ``ad_type`` are filled in later
# with ``str.format``; the page size is substituted right here.
URL_SEARCH = '?is=1&p={page}&f={term}&n={ad_type}&r=&i=%d&o=datum' % PER_PAGE
# Category key -> URL path of that category.
URL_CATEGORIES = {'guitarother': '/kytarova-pouzdra-a-prislusenstvi/110600/',
                  'guitarfx': '/kytarove-efekty/110500/',
                  'any': '/vsechny-kategorie/0/', }
# Ad type key -> value of the ``n`` query parameter.
AD_TYPES = dict(sell='nabidka', buy='poptavka', other='ruzne')
# Czech month names in calendar order, in the form parse_date() looks them up.
MONTHS = (u'ledna', u'února', u'března', u'dubna', u'května', u'června',
          u'července', u'srpna', u'září', u'října', u'listopadu', u'prosince')
# Month name -> month number (1-12).
MONTHS_MAP = dict(zip(MONTHS, range(1, 13)))


def parse_date(date_str):
    """Return the date mentioned in *date_str* as a ``datetime.date``.

    The text is scanned for "<day>. <month name>" where the month name is
    one of ``MONTHS``.  The text carries no year, so the most recent such
    date that is not in the future is returned (this year, else last year).

    When no usable date is found the result is today if the text contains
    'dnes', otherwise yesterday.
    """
    today = datetime.date.today()
    fallback = today if 'dnes' in date_str else today - datetime.timedelta(days=1)

    for found in re.finditer(r'(\d+)\.\s+(\w+)', date_str, re.UNICODE):
        day, month = found.groups()
        month_num = MONTHS_MAP.get(month)
        if month_num is None:
            # A number followed by some other word, not a date.
            continue
        for year in (today.year, today.year - 1):
            try:
                candidate = datetime.date(year, month_num, int(day))
            except ValueError:
                # Impossible day for that month/year (e.g. 29 February).
                continue
            # Comparing whole dates keeps earlier days of the current month
            # in the current year.
            if candidate <= today:
                return candidate

    return fallback


# An amount followed by "Kč": either plain digits or groups of three digits
# separated by whitespace/dots, optionally followed by ",<decimals>" or ",-".
# Only the whole-crown part is captured.
_PRICE_RE = re.compile(
    r'(\d{1,3}(?:[\s.]\d{3})+|\d+)(?:,(?:\d+|-+))?\s*' + u'Kč', re.UNICODE)


def _parse_price_czk(price_text):
    """Return the whole-crown amount found in *price_text*, or None."""
    found = _PRICE_RE.search(price_text)
    if found is None:
        return None
    # Drop thousands separators ("12 500", "12.500") before converting.
    return int(re.sub(r'[\s.]', '', found.group(1), flags=re.UNICODE))


def parse_doc(resp, days, max_price, **kwargs):
    """Yield the ads found in one search-result page.

    :param resp: response object whose ``content`` attribute holds the HTML
    :param days: if truthy, skip ads whose parsed date is more than this
                 many days before today
    :param max_price: if truthy, skip ads without a recognisable "Kč" price
                      and ads priced above this value
    :param kwargs: accepted and ignored so callers may pass extra options
    :returns: generator of ``(title, link, text, img, price, loc_data)``;
              ``img`` is None when the ad has no ``a.fancybox`` link and
              ``price`` is the raw price text ('?' if there is none) when no
              amount could be parsed
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    doc = BeautifulSoup(resp.content, 'html.parser')
    today = datetime.date.today()

    for ad in doc.select('td.InzeratBody'):
        contact = ad.select('div.InzeratKontakt')
        loc_data = contact[0].text if contact else ''

        # The date is only needed, and only parsed, when filtering by age.
        if days and (today - parse_date(loc_data)).days > days:
            continue

        heading = ad.select('div.InzeratNadpisSmall')
        anchor = heading[0].find('a') if heading else None
        if anchor is None or not anchor.get('href'):
            # No title link: skip the cell instead of crashing on it.
            continue
        title, link = heading[0].text, URL_BASE + anchor['href']

        body = ad.select('div.InzeratTextSmall')
        text = body[0].text if body else ''

        photo = ad.select('a.fancybox')
        img = photo[0]['href'] if photo else None

        price_box = ad.select('div.InzeratCenaSmall')
        price = price_box[0].text if price_box else '?'
        price_czk = _parse_price_czk(price)

        if max_price and (price_czk is None or price_czk > max_price):
            continue

        shown_price = price if price_czk is None else u'%d Kč' % price_czk
        yield title, link, text, img, shown_price, loc_data


def search(term, category='any', ad_type='sell', days=0, max_price=None,
           timeout=30, **kwargs):
    """Search the site for *term* and return an iterator over matching ads.

    :param term: search phrase; it is URL-encoded before being sent
    :param category: key of ``URL_CATEGORIES``
    :param ad_type: key of ``AD_TYPES``
    :param days: forwarded to ``parse_doc``
    :param max_price: forwarded to ``parse_doc``
    :param timeout: seconds to wait for each HTTP request
    :param kwargs: forwarded to ``parse_doc``
    :returns: a lazy iterator chaining the ads of every fetched page
    :raises requests.HTTPError: if the first result page cannot be fetched

    A failure on a later page only ends paging; the pages fetched so far
    are still returned.
    """
    try:
        from urllib.parse import quote_plus  # Python 3
    except ImportError:
        from urllib import quote_plus  # Python 2

    max_pages = 10  # 500 items should be enough

    if isinstance(term, type(u'')):
        # Encode text explicitly so quoting works the same on Python 2 and 3.
        term = term.encode('utf-8')
    query = dict(term=quote_plus(term), ad_type=AD_TYPES[ad_type])
    base_url = URL_BASE + URL_CATEGORIES[category]

    def fetch(page):
        url = base_url + URL_SEARCH.format(page=page, **query)
        return requests.get(url, timeout=timeout)

    first = fetch(1)
    first.raise_for_status()
    docs = [first]

    # The first page reports the total number of hits ("Celkem <n>").
    # If it is missing, only the first page is parsed.
    found = re.search(br'Celkem\s(\d+)', first.content)
    ad_count = int(found.group(1)) if found else 0
    # Round up, so an exact multiple of PER_PAGE does not add an empty page.
    pages = max(1, (ad_count + PER_PAGE - 1) // PER_PAGE)
    pages = min(pages, max_pages)

    for page in range(2, pages + 1):
        resp = fetch(page)
        if resp.status_code != requests.codes.ok:
            break
        docs.append(resp)

    return itertools.chain.from_iterable(
        parse_doc(doc, days=days, max_price=max_price, **kwargs) for doc in docs)


def mail(subj, mess, to_, from_='bazarbot@starenka.net'):
    """Send *mess* as a UTF-8 plain-text e-mail via the SMTP server on localhost.

    :param subj: value of the Subject header
    :param mess: message body
    :param to_: recipient address (used for the To header and the envelope)
    :param from_: sender address (used for the From header and the envelope)
    """
    msg = MIMEMultipart('alternative')
    msg['Subject'] = subj
    msg['From'] = from_
    msg['To'] = to_
    msg.attach(MIMEText(mess, 'plain', 'utf-8'))

    smtp = smtplib.SMTP('localhost')
    try:
        smtp.sendmail(from_, [to_], msg.as_string().encode('ascii'))
    finally:
        # Release the connection even when sending failed.
        try:
            smtp.quit()
        except smtplib.SMTPServerDisconnected:
            # The server is already gone; just close the socket.
            smtp.close()

def main():
    """Parse the command line, run every search and print or mail the listing."""
    parser = argparse.ArgumentParser()
    parser.add_argument('terms', nargs="+", help='search terms')
    # Category and ad type selection are disabled (see the TODO at the top).
    # parser.add_argument('-c', '--category', default='any', choices=URL_CATEGORIES.keys(),
    #                    help='category to search (defaults to all)')
    #parser.add_argument('-t', '--ad_type', default='sell', choices=AD_TYPES.keys(), help='ad type')
    parser.add_argument('-p', '--max_price', type=int, default=0, help='max price')
    parser.add_argument('-m', '--mail', default=False, help='mail to',)
    parser.add_argument('-s', '--short', action='store_true', default=False, help='short listing',)
    parser.add_argument('-d', '--days', type=int, default=0, help='just ads within [days]')

    args = parser.parse_args()

    # Collect the pieces and join once instead of growing a string in a loop.
    chunks = []
    for term in args.terms:
        # Pass only the options search() uses, not every CLI argument.
        results = search(term, days=args.days, max_price=args.max_price)
        for title, link, text, img, price, loc_data in results:
            chunks.append(' '.join((title, price, loc_data)))
            if not args.short:
                chunks.append('\n%s' % text)
            chunks.append('\n%s\n\n' % ' '.join((link, img or '')))
    body = ''.join(chunks)

    if not body:
        return

    if args.mail:
        subj = u'bazarbot "%s"' % ' OR '.join(args.terms)
        if args.max_price:
            subj += u' < %d Kč' % args.max_price
        mail(subj, body, to_=args.mail)
    else:
        # The call form works on both Python 2 and Python 3.
        print(body)


if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	# coding=utf-8

	# pip install requests beautifulsoup4
	# TODO: handle category selection (broken server-side)
	# fix occasional incorrect price detection

	from __future__ import division
	import re
	import datetime
	import argparse
	import itertools
	import smtplib
	from email.mime.text import MIMEText
	from email.mime.multipart import MIMEMultipart

	import requests
	from bs4 import BeautifulSoup

	# Number of ads requested per result page (sent as the ``i`` query parameter).
	PER_PAGE = 50
	URL_BASE = 'http://hudebnibazar.cz'
	# Query-string template: ``page``, ``term`` and ``ad_type`` are filled in later
	# with ``str.format``; the page size is substituted right here.
	URL_SEARCH = '?is=1&p={page}&f={term}&n={ad_type}&r=&i=%d&o=datum' % PER_PAGE
	# Category key -> URL path of that category.
	URL_CATEGORIES = {'guitarother': '/kytarova-pouzdra-a-prislusenstvi/110600/',
	                  'guitarfx': '/kytarove-efekty/110500/',
	                  'any': '/vsechny-kategorie/0/', }
	# Ad type key -> value of the ``n`` query parameter.
	AD_TYPES = dict(sell='nabidka', buy='poptavka', other='ruzne')
	# Czech month names in calendar order, in the form parse_date() looks them up.
	MONTHS = (u'ledna', u'února', u'března', u'dubna', u'května', u'června',
	          u'července', u'srpna', u'září', u'října', u'listopadu', u'prosince')
	# Month name -> month number (1-12).
	MONTHS_MAP = dict(zip(MONTHS, range(1, 13)))


	def parse_date(date_str):
	    """Return the date mentioned in *date_str* as a ``datetime.date``.

	    The text is scanned for "<day>. <month name>" where the month name is
	    one of ``MONTHS``.  The text carries no year, so the most recent such
	    date that is not in the future is returned (this year, else last year).

	    When no usable date is found the result is today if the text contains
	    'dnes', otherwise yesterday.
	    """
	    today = datetime.date.today()
	    fallback = today if 'dnes' in date_str else today - datetime.timedelta(days=1)

	    for found in re.finditer(r'(\d+)\.\s+(\w+)', date_str, re.UNICODE):
	        day, month = found.groups()
	        month_num = MONTHS_MAP.get(month)
	        if month_num is None:
	            # A number followed by some other word, not a date.
	            continue
	        for year in (today.year, today.year - 1):
	            try:
	                candidate = datetime.date(year, month_num, int(day))
	            except ValueError:
	                # Impossible day for that month/year (e.g. 29 February).
	                continue
	            # Comparing whole dates keeps earlier days of the current month
	            # in the current year.
	            if candidate <= today:
	                return candidate

	    return fallback


	# An amount followed by "Kč": either plain digits or groups of three digits
	# separated by whitespace/dots, optionally followed by ",<decimals>" or ",-".
	# Only the whole-crown part is captured.
	_PRICE_RE = re.compile(
	    r'(\d{1,3}(?:[\s.]\d{3})+|\d+)(?:,(?:\d+|-+))?\s*' + u'Kč', re.UNICODE)


	def _parse_price_czk(price_text):
	    """Return the whole-crown amount found in *price_text*, or None."""
	    found = _PRICE_RE.search(price_text)
	    if found is None:
	        return None
	    # Drop thousands separators ("12 500", "12.500") before converting.
	    return int(re.sub(r'[\s.]', '', found.group(1), flags=re.UNICODE))


	def parse_doc(resp, days, max_price, **kwargs):
	    """Yield the ads found in one search-result page.

	    :param resp: response object whose ``content`` attribute holds the HTML
	    :param days: if truthy, skip ads whose parsed date is more than this
	                 many days before today
	    :param max_price: if truthy, skip ads without a recognisable "Kč" price
	                      and ads priced above this value
	    :param kwargs: accepted and ignored so callers may pass extra options
	    :returns: generator of ``(title, link, text, img, price, loc_data)``;
	              ``img`` is None when the ad has no ``a.fancybox`` link and
	              ``price`` is the raw price text ('?' if there is none) when no
	              amount could be parsed
	    """
	    # Name the parser explicitly so the result does not depend on which
	    # optional parser libraries happen to be installed.
	    doc = BeautifulSoup(resp.content, 'html.parser')
	    today = datetime.date.today()

	    for ad in doc.select('td.InzeratBody'):
	        contact = ad.select('div.InzeratKontakt')
	        loc_data = contact[0].text if contact else ''

	        # The date is only needed, and only parsed, when filtering by age.
	        if days and (today - parse_date(loc_data)).days > days:
	            continue

	        heading = ad.select('div.InzeratNadpisSmall')
	        anchor = heading[0].find('a') if heading else None
	        if anchor is None or not anchor.get('href'):
	            # No title link: skip the cell instead of crashing on it.
	            continue
	        title, link = heading[0].text, URL_BASE + anchor['href']

	        body = ad.select('div.InzeratTextSmall')
	        text = body[0].text if body else ''

	        photo = ad.select('a.fancybox')
	        img = photo[0]['href'] if photo else None

	        price_box = ad.select('div.InzeratCenaSmall')
	        price = price_box[0].text if price_box else '?'
	        price_czk = _parse_price_czk(price)

	        if max_price and (price_czk is None or price_czk > max_price):
	            continue

	        shown_price = price if price_czk is None else u'%d Kč' % price_czk
	        yield title, link, text, img, shown_price, loc_data


	def search(term, category='any', ad_type='sell', days=0, max_price=None,
	           timeout=30, **kwargs):
	    """Search the site for *term* and return an iterator over matching ads.

	    :param term: search phrase; it is URL-encoded before being sent
	    :param category: key of ``URL_CATEGORIES``
	    :param ad_type: key of ``AD_TYPES``
	    :param days: forwarded to ``parse_doc``
	    :param max_price: forwarded to ``parse_doc``
	    :param timeout: seconds to wait for each HTTP request
	    :param kwargs: forwarded to ``parse_doc``
	    :returns: a lazy iterator chaining the ads of every fetched page
	    :raises requests.HTTPError: if the first result page cannot be fetched

	    A failure on a later page only ends paging; the pages fetched so far
	    are still returned.
	    """
	    try:
	        from urllib.parse import quote_plus  # Python 3
	    except ImportError:
	        from urllib import quote_plus  # Python 2

	    max_pages = 10  # 500 items should be enough

	    if isinstance(term, type(u'')):
	        # Encode text explicitly so quoting works the same on Python 2 and 3.
	        term = term.encode('utf-8')
	    query = dict(term=quote_plus(term), ad_type=AD_TYPES[ad_type])
	    base_url = URL_BASE + URL_CATEGORIES[category]

	    def fetch(page):
	        url = base_url + URL_SEARCH.format(page=page, **query)
	        return requests.get(url, timeout=timeout)

	    first = fetch(1)
	    first.raise_for_status()
	    docs = [first]

	    # The first page reports the total number of hits ("Celkem <n>").
	    # If it is missing, only the first page is parsed.
	    found = re.search(br'Celkem\s(\d+)', first.content)
	    ad_count = int(found.group(1)) if found else 0
	    # Round up, so an exact multiple of PER_PAGE does not add an empty page.
	    pages = max(1, (ad_count + PER_PAGE - 1) // PER_PAGE)
	    pages = min(pages, max_pages)

	    for page in range(2, pages + 1):
	        resp = fetch(page)
	        if resp.status_code != requests.codes.ok:
	            break
	        docs.append(resp)

	    return itertools.chain.from_iterable(
	        parse_doc(doc, days=days, max_price=max_price, **kwargs) for doc in docs)


	def mail(subj, mess, to_, from_='bazarbot@starenka.net'):
	    """Send *mess* as a UTF-8 plain-text e-mail via the SMTP server on localhost.

	    :param subj: value of the Subject header
	    :param mess: message body
	    :param to_: recipient address (used for the To header and the envelope)
	    :param from_: sender address (used for the From header and the envelope)
	    """
	    msg = MIMEMultipart('alternative')
	    msg['Subject'] = subj
	    msg['From'] = from_
	    msg['To'] = to_
	    msg.attach(MIMEText(mess, 'plain', 'utf-8'))

	    smtp = smtplib.SMTP('localhost')
	    try:
	        smtp.sendmail(from_, [to_], msg.as_string().encode('ascii'))
	    finally:
	        # Release the connection even when sending failed.
	        try:
	            smtp.quit()
	        except smtplib.SMTPServerDisconnected:
	            # The server is already gone; just close the socket.
	            smtp.close()

	def main():
	    """Parse the command line, run every search and print or mail the listing."""
	    parser = argparse.ArgumentParser()
	    parser.add_argument('terms', nargs="+", help='search terms')
	    # Category and ad type selection are disabled (see the TODO at the top).
	    # parser.add_argument('-c', '--category', default='any', choices=URL_CATEGORIES.keys(),
	    #                    help='category to search (defaults to all)')
	    #parser.add_argument('-t', '--ad_type', default='sell', choices=AD_TYPES.keys(), help='ad type')
	    parser.add_argument('-p', '--max_price', type=int, default=0, help='max price')
	    parser.add_argument('-m', '--mail', default=False, help='mail to',)
	    parser.add_argument('-s', '--short', action='store_true', default=False, help='short listing',)
	    parser.add_argument('-d', '--days', type=int, default=0, help='just ads within [days]')

	    args = parser.parse_args()

	    # Collect the pieces and join once instead of growing a string in a loop.
	    chunks = []
	    for term in args.terms:
	        # Pass only the options search() uses, not every CLI argument.
	        results = search(term, days=args.days, max_price=args.max_price)
	        for title, link, text, img, price, loc_data in results:
	            chunks.append(' '.join((title, price, loc_data)))
	            if not args.short:
	                chunks.append('\n%s' % text)
	            chunks.append('\n%s\n\n' % ' '.join((link, img or '')))
	    body = ''.join(chunks)

	    if not body:
	        return

	    if args.mail:
	        subj = u'bazarbot "%s"' % ' OR '.join(args.terms)
	        if args.max_price:
	            subj += u' < %d Kč' % args.max_price
	        mail(subj, body, to_=args.mail)
	    else:
	        # The call form works on both Python 2 and Python 3.
	        print(body)


	if __name__ == '__main__':
	    main()