#!/usr/bin/env python
# coding=utf-8
# pip install requests beautifulsoup4
# TODO: handle category selection (currently broken on the server side)
# TODO: fix the occasional incorrect price detection
from __future__ import division
import re
import datetime
import argparse
import itertools
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import requests
from bs4 import BeautifulSoup
# Number of ads requested per result page. search() caps paging at 10 pages,
# which its own comment describes as "500 items", i.e. 50 ads per page.
PER_PAGE = 50

# NOTE(review): URL_BASE was referenced but never defined in this file.  The
# value below is a reconstruction based on the category paths and query
# string used here -- confirm it against the site actually being scraped.
URL_BASE = 'http://hudebnibazar.cz'

# Query-string template; {page}, {term} and {ad_type} are filled in by
# search(), the page size is baked in here.
URL_SEARCH = '?is=1&p={page}&f={term}&n={ad_type}&r=&i=%d&o=datum' % PER_PAGE

# Category key -> URL path appended to URL_BASE.
URL_CATEGORIES = {
    'guitarother': '/kytarova-pouzdra-a-prislusenstvi/110600/',
    'guitarfx': '/kytarove-efekty/110500/',
    'any': '/vsechny-kategorie/0/',
}

# Ad type key -> value of the "n" query parameter.
AD_TYPES = dict(sell='nabidka', buy='poptavka', other='ruzne')

# Czech month names in the genitive form, as matched by parse_date().
MONTHS = (u'ledna', u'února', u'března', u'dubna', u'května', u'června',
          u'července', u'srpna', u'září', u'října', u'listopadu', u'prosince')

# Month name -> month number (1-12).
MONTHS_MAP = dict(zip(MONTHS, range(1, 13)))
def parse_date(date_str):
    """Extract the posting date from an ad's contact/location text.

    The text is searched for a "<day>. <month name>" fragment whose month
    name is a key of MONTHS_MAP.  No year is present in that fragment, so the
    most recent date that is not in the future is chosen.

    :param date_str: free text taken from the ad.
    :returns: a ``datetime.date``.  When no usable day/month fragment is
        found, today is returned if the text contains ``'dnes'`` and
        yesterday otherwise.
    """
    today = datetime.date.today()
    fallback = today if 'dnes' in date_str else today - datetime.timedelta(days=1)

    for found in re.finditer(r'(\d+)\.\s+(\w+)', date_str, re.UNICODE):
        day, month = found.groups()
        month_num = MONTHS_MAP.get(month)
        if month_num is None:
            # "<number>. <word>" where the word is not a month name.
            continue
        # Try the current year first; an ad dated later than today must be
        # from the previous year.  ValueError covers impossible dates such
        # as a 29 February in a non-leap year.
        for year in (today.year, today.year - 1):
            try:
                candidate = datetime.date(year, month_num, int(day))
            except ValueError:
                continue
            if candidate <= today:
                return candidate
    return fallback
def parse_doc(resp, days, max_price, **kwargs):
    """Yield the ads found on one search-result page.

    :param resp: HTTP response object; its ``content`` is parsed as HTML.
    :param days: when truthy, ads older than this many days are skipped.
    :param max_price: when truthy, ads without a recognisable CZK price and
        ads priced above this value are skipped.
    :param kwargs: accepted and ignored, so callers can pass extra options.
    :returns: generator of ``(title, link, text, img, price, loc_data)``
        tuples; ``img`` is ``None`` when the ad has no picture and ``price``
        is ``'?'`` when the ad has no price element.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    doc = BeautifulSoup(resp.content, 'html.parser')
    today = datetime.date.today()

    for one in doc.select('td.InzeratBody'):
        loc_data_ = one.select('div.InzeratKontakt')
        loc_data = loc_data_[0].text if loc_data_ else ''
        date = parse_date(loc_data)
        if days and (today - date).days > days:
            continue

        img_ = one.select('a.fancybox')
        img = img_[0]['href'] if img_ else None

        title_ = one.select('div.InzeratNadpisSmall')[0]
        title, link = title_.text, URL_BASE + title_.find('a')['href']
        text = one.select('div.InzeratTextSmall')[0].text

        price_ = one.select('div.InzeratCenaSmall')
        price = price_[0].text if price_ else '?'
        # Unicode pattern so that the "č" matches unicode text on Python 2
        # as well.  Only a plain run of digits directly before " Kč" is
        # recognised.
        czkm = re.search(u'(\\d+) Kč', price, re.UNICODE)
        price_czk = int(czkm.group(1)) if czkm else 0

        if max_price and not czkm:
            continue
        if max_price and price_czk > max_price:
            continue

        shown_price = u'%d Kč' % price_czk if czkm else price
        yield title, link, text, img, shown_price, loc_data
def search(term, category='any', ad_type='sell', days=0, max_price=None, **kwargs):
    """Search for ``term`` and return an iterator over the matching ads.

    The first result page is fetched and the total number of hits is read
    from the "Celkem <n>" text on it.  Further pages are then fetched, up to
    10 pages in total, stopping early at the first non-OK response.

    :param term: search term inserted into the query string.
    :param category: key of URL_CATEGORIES.
    :param ad_type: key of AD_TYPES.
    :param days: passed on to parse_doc().
    :param max_price: passed on to parse_doc().
    :param kwargs: passed on to parse_doc().
    :returns: lazy iterator over the tuples yielded by parse_doc() for every
        fetched page.
    :raises KeyError: for an unknown ``category`` or ``ad_type``.
    """
    max_pages = 10  # 500 items should be enough
    timeout = 30  # seconds; without it a stalled connection blocks forever
    base_url = URL_BASE + URL_CATEGORIES[category]
    kind = AD_TYPES[ad_type]

    def page_url(page):
        return base_url + URL_SEARCH.format(term=term, ad_type=kind, page=page)

    docs = [requests.get(page_url(1), timeout=timeout)]

    # .text (not .content) so the str pattern also works on Python 3.
    found = re.search(r'Celkem\s(\d+)', docs[0].text)
    hits = int(found.group(1)) if found else 0
    # Ceiling division: an exact multiple of PER_PAGE needs no extra page.
    pages = min(-(-hits // PER_PAGE), max_pages)

    for page in range(2, pages + 1):
        next_ = requests.get(page_url(page), timeout=timeout)
        if next_.status_code != requests.codes.ok:
            break
        docs.append(next_)

    return itertools.chain.from_iterable(
        parse_doc(doc, days=days, max_price=max_price, **kwargs) for doc in docs)
def mail(subj, mess, to_, from_=''):
    """Send a plain-text UTF-8 e-mail through the SMTP server on localhost.

    :param subj: subject line.
    :param mess: message body.
    :param to_: recipient address.
    :param from_: sender address (empty by default).
    """
    msg = MIMEMultipart('alternative')
    msg['Subject'] = subj
    msg['From'] = from_
    msg['To'] = to_
    # The text part has to be attached, otherwise the mail has no body.
    msg.attach(MIMEText(mess, 'plain', 'utf-8'))

    s = smtplib.SMTP('localhost')
    try:
        s.sendmail(from_, [to_], msg.as_string().encode('ascii'))
    finally:
        # Always close the connection, even when sending fails.
        s.quit()
if __name__ == '__main__':
    # Command-line entry point: search for every term, print the listing and
    # optionally mail it.
    parser = argparse.ArgumentParser()
    parser.add_argument('terms', nargs="+", help='search terms')
    # Category and ad-type selection are disabled (see the TODO at the top
    # of the file); search() falls back to its defaults.
    # parser.add_argument('-c', '--category', default='any', choices=URL_CATEGORIES.keys(),
    #                     help='category to search (defaults to all)')
    # parser.add_argument('-t', '--ad_type', default='sell', choices=AD_TYPES.keys(), help='ad type')
    parser.add_argument('-p', '--max_price', type=int, default=0, help='max price')
    parser.add_argument('-m', '--mail', default=False, help='mail to',)
    parser.add_argument('-s', '--short', action='store_true', default=False, help='short listing',)
    parser.add_argument('-d', '--days', type=int, default=0, help='just ads within [days]')
    args = parser.parse_args()

    # Collect the pieces and join once instead of repeated string +=.
    chunks = []
    for term in args.terms:
        # All parsed options are forwarded; search() ignores the ones it
        # does not know through **kwargs.
        for title, link, text, img, price, loc_data in search(term, **vars(args)):
            chunks.append(' '.join((title, price, loc_data)))
            if not args.short:
                chunks.append('\n%s' % text)
            chunks.append('\n%s\n\n' % ' '.join((link, img if img else '')))
    body = ''.join(chunks)

    if body:
        if args.mail:
            subj = u'bazarbot "%s"' % ' OR '.join(args.terms)
            subj = subj if not args.max_price else subj + (u' < %d Kč' % args.max_price)
            mail(subj, body, to_=args.mail)
        # Parenthesised form works as a statement on Python 2 and as a
        # function call on Python 3.
        print(body)

