Skip to content

Instantly share code, notes, and snippets.

@starenka starenka/hbazar.py
Last active Aug 29, 2015

Embed
What would you like to do?
#!/usr/bin/env python
# coding=utf-8
# pip install requests beautifulsoup4
# TODO: handle category selection (broken on the server side)
#       fix occasional incorrect price detection
from __future__ import division
import re
import datetime
import argparse
import itertools
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import requests
from bs4 import BeautifulSoup
# Number of ads requested per result page (sent as the `i` query parameter).
PER_PAGE = 50

# Site root; category paths and ad links are appended to it.
URL_BASE = 'http://hudebnibazar.cz'

# Query-string template. PER_PAGE is substituted right here, while {page},
# {term} and {ad_type} are left in place for a later str.format() call.
URL_SEARCH = '?is=1&p={page}&f={term}&n={ad_type}&r=&i=%d&o=datum' % PER_PAGE

# Category key -> URL path of that category's listing.
URL_CATEGORIES = {
    'guitarother': '/kytarova-pouzdra-a-prislusenstvi/110600/',
    'guitarfx': '/kytarove-efekty/110500/',
    'any': '/vsechny-kategorie/0/',
}

# Ad type key -> value sent as the `n` query parameter.
AD_TYPES = {'sell': 'nabidka', 'buy': 'poptavka', 'other': 'ruzne'}

# Czech month names in calendar order, in the form matched against ad dates.
MONTHS = (
    u'ledna', u'února', u'března', u'dubna', u'května', u'června',
    u'července', u'srpna', u'září', u'října', u'listopadu', u'prosince',
)

# Month name -> month number (1-12).
MONTHS_MAP = {name: number for number, name in enumerate(MONTHS, 1)}
def parse_date(date_str):
    """Work out the calendar date of an ad from its date text.

    Two forms are understood:

    * an explicit "<day>. <month name>" (month names are looked up in
      MONTHS_MAP); the text carries no year, so the most recent occurrence
      of that day that is not in the future is used;
    * relative wording: text containing 'dnes' means today, anything else
      falls back to yesterday.

    The relative result is also returned when the explicit part cannot be
    used (unknown month word, or a day that does not exist in that month).

    :param date_str: text containing the ad's date
    :return: a ``datetime.date``
    """
    today = datetime.date.today()
    fallback = today if 'dnes' in date_str else today - datetime.timedelta(days=1)

    found = re.search(r'(\d+)\.\s+(\w+)', date_str, re.UNICODE)
    if not found:
        return fallback

    day, month = found.groups()
    # The pattern also matches things like "3. patro"; only trust real months.
    month_num = MONTHS_MAP.get(month)
    if month_num is None:
        return fallback

    # Try this year first and step back one year if that would lie in the
    # future. Comparing whole dates (not just month numbers) keeps ads from
    # earlier in the current month in the current year.
    for year in (today.year, today.year - 1):
        try:
            candidate = datetime.date(year, month_num, int(day))
        except ValueError:
            # e.g. 29 February in a non-leap year, or day 31 of a short month
            continue
        if candidate <= today:
            return candidate
    return fallback
# Amount in front of the currency mark, e.g. "500 Kč", "12 000 Kč",
# "1.500,- Kč" or "1500,50 Kč". Thousands may be grouped with a space, a
# no-break space or a dot; a trailing ",-" or decimal part is ignored.
# Written with escapes so the pattern does not depend on the file encoding.
_PRICE_CZK_RE = re.compile(
    u'(\\d+(?:[ \u00a0.]\\d{3})*)(?:[.,](?:\\d{1,2}|-))?\\s*K\u010d',
    re.UNICODE)


def _parse_price_czk(price_text):
    """Return the whole-crown amount found in *price_text*, or None."""
    found = _PRICE_CZK_RE.search(price_text)
    if found is None:
        return None
    # Drop the thousands separators before converting.
    return int(re.sub(r'\D', u'', found.group(1)))


def parse_doc(resp, days, max_price, **kwargs):
    """Yield the ads found in one search-result page.

    :param resp: HTTP response whose ``content`` holds the page HTML
    :param days: when truthy, skip ads whose parsed date is more than this
        many days old
    :param max_price: when truthy, skip ads without a recognisable CZK price
        and ads priced above this value
    :param kwargs: accepted and ignored, so callers can pass extra options
    :return: generator of ``(title, link, text, img, price, loc_data)``
        tuples; ``img`` is None when the ad has no image link, ``price`` is
        the amount as a digit string or, when no amount was recognised, the
        raw price text ('?' if the ad has no price element)
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    doc = BeautifulSoup(resp.content, 'html.parser')
    today = datetime.date.today()

    for ad in doc.select('td.InzeratBody'):
        contact = ad.select('div.InzeratKontakt')
        loc_data = contact[0].text if contact else ''
        if days and (today - parse_date(loc_data)).days > days:
            continue

        # An ad without a linked title cannot be presented; skip it rather
        # than abort the whole listing.
        heading = ad.select('div.InzeratNadpisSmall')
        anchor = heading[0].find('a') if heading else None
        if anchor is None or not anchor.get('href'):
            continue
        title = heading[0].text
        link = URL_BASE + anchor['href']

        description = ad.select('div.InzeratTextSmall')
        text = description[0].text if description else ''

        image = ad.select('a.fancybox')
        img = image[0].get('href') if image else None

        price_node = ad.select('div.InzeratCenaSmall')
        price = price_node[0].text if price_node else '?'
        price_czk = _parse_price_czk(price)
        if max_price and (price_czk is None or price_czk > max_price):
            continue

        shown_price = price if price_czk is None else u'%d' % price_czk
        yield title, link, text, img, shown_price, loc_data
def search(term, category='any', ad_type='sell', days=0, max_price=None, **kwargs):
    """Search the site for *term* and iterate over the matching ads.

    The first result page is always fetched. Further pages are fetched
    according to the total reported on that page, up to 10 pages in all;
    fetching stops early at the first response that is not OK.

    :param term: search phrase
    :param category: key of URL_CATEGORIES
    :param ad_type: key of AD_TYPES
    :param days: passed to parse_doc
    :param max_price: passed to parse_doc
    :param kwargs: passed to parse_doc unchanged
    :return: a lazy iterator over the tuples produced by parse_doc for
        every fetched page, in page order
    :raises KeyError: for an unknown category or ad_type
    """
    max_pages = 10  # 500 items should be enough
    base_url = URL_BASE + URL_CATEGORIES[category]
    type_value = AD_TYPES[ad_type]

    # Percent-encode the term so characters such as '&', '#' or '+' cannot
    # alter the query string. Text is encoded to UTF-8 first so the same
    # call works for both text and byte strings.
    raw_term = term.encode('utf-8') if isinstance(term, type(u'')) else term
    quoted_term = requests.compat.quote(raw_term, safe='')

    def page_url(page):
        return base_url + URL_SEARCH.format(
            term=quoted_term, ad_type=type_value, page=page)

    first = requests.get(page_url(1))
    docs = [first]

    # The total is matched on the raw bytes, so no decoding is needed.
    # A page without a total is treated as having no further pages.
    found = re.search(br'Celkem\s(\d+)', first.content)
    hits = int(found.group(1)) if found else 0
    # Round up, so an exact multiple of PER_PAGE does not request an extra
    # page.
    pages = min((hits + PER_PAGE - 1) // PER_PAGE, max_pages)

    for page in range(2, pages + 1):
        following = requests.get(page_url(page))
        if following.status_code != requests.codes.ok:
            break
        docs.append(following)

    return itertools.chain.from_iterable(
        parse_doc(doc, days=days, max_price=max_price, **kwargs)
        for doc in docs)
def mail(subj, mess, to_, from_='bazarbot@starenka.net'):
    """Send *mess* as a UTF-8 plain-text e-mail through the local SMTP server.

    :param subj: subject line
    :param mess: message body
    :param to_: recipient address
    :param from_: sender address
    """
    msg = MIMEMultipart('alternative')
    msg['Subject'] = subj
    msg['From'] = from_
    msg['To'] = to_
    msg.attach(MIMEText(mess, 'plain', 'utf-8'))

    smtp = smtplib.SMTP('localhost')
    try:
        smtp.sendmail(from_, [to_], msg.as_string().encode('ascii'))
        smtp.quit()
    finally:
        # Release the socket even when sending failed. After a successful
        # quit() the connection is already closed, so this changes nothing.
        smtp.close()
if __name__ == '__main__':
    # Command-line entry point: search every given term, then either mail
    # the combined listing or print it.
    parser = argparse.ArgumentParser()
    parser.add_argument('terms', nargs="+", help='search terms')
    # Category and ad type selection are disabled for now (see the TODO at
    # the top of the file).
    # parser.add_argument('-c', '--category', default='any', choices=URL_CATEGORIES.keys(),
    #                     help='category to search (defaults to all)')
    # parser.add_argument('-t', '--ad_type', default='sell', choices=AD_TYPES.keys(), help='ad type')
    parser.add_argument('-p', '--max_price', type=int, default=0, help='max price')
    parser.add_argument('-m', '--mail', default=False, help='mail to',)
    parser.add_argument('-s', '--short', action='store_true', default=False, help='short listing',)
    parser.add_argument('-d', '--days', type=int, default=0, help='just ads within [days]')
    args = parser.parse_args()

    # Collect the pieces and join once instead of growing a string.
    chunks = []
    for term in args.terms:
        # Every parsed option is handed over; search() and parse_doc() pick
        # what they know and ignore the rest through **kwargs.
        for title, link, text, img, price, loc_data in search(term, **vars(args)):
            chunks.append(' '.join((title, price, loc_data)))
            if not args.short:
                chunks.append('\n%s' % text)
            chunks.append('\n%s\n\n' % ' '.join((link, img if img else '')))
    body = ''.join(chunks)

    if body:
        if args.mail:
            # On Python 2 the terms arrive as byte strings; decode them so a
            # non-ASCII term cannot break the unicode subject line.
            terms = [t.decode('utf-8', 'replace') if isinstance(t, bytes) else t
                     for t in args.terms]
            subj = u'bazarbot "%s"' % u' OR '.join(terms)
            if args.max_price:
                subj += u' < %d' % args.max_price
            mail(subj, body, to_=args.mail)
        else:
            # Call form with one argument behaves the same on Python 2 and 3.
            print(body)
@starenka

This comment has been minimized.

Copy link
Owner Author

commented Mar 1, 2015

hbazar.py 'dunlop mxr' 'boss' 'whammy' -p1600 -d1 -mstarenka0@gmail.com #mail me any dunlop/boss/whammy hits cheaper than 1600 CZK added since yesterday (put it into your crontab)
hbazar.py 'fender jazzmaster' #show me any ads selling a jazzmaster

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.