Skip to content

Instantly share code, notes, and snippets.

@starenka
Last active August 29, 2015 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save starenka/d3fce9886dc67bc48124 to your computer and use it in GitHub Desktop.
Save starenka/d3fce9886dc67bc48124 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding=utf-8
# pip install requests beautifulsoup4
# TODO: handle category selection (currently broken server-side)
# TODO: fix the occasional incorrect price detection
from __future__ import division
import re
import datetime
import argparse
import itertools
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import requests
from bs4 import BeautifulSoup
# Number of ads requested per result page (the "i" query parameter).
PER_PAGE = 50
# Site root; category paths and ad links are appended to it.
URL_BASE = 'http://hudebnibazar.cz'
# Query-string template; page, term and ad_type are filled in per request.
URL_SEARCH = '?is=1&p={page}&f={term}&n={ad_type}&r=&i=%d&o=datum' % PER_PAGE
# Category key -> URL path of that category's listing.
URL_CATEGORIES = {
    'guitarother': '/kytarova-pouzdra-a-prislusenstvi/110600/',
    'guitarfx': '/kytarove-efekty/110500/',
    'any': '/vsechny-kategorie/0/',
}
# Ad type key -> value sent in the "n" query parameter.
AD_TYPES = {'sell': 'nabidka', 'buy': 'poptavka', 'other': 'ruzne'}
# Month names as looked up when parsing ad dates, January first.
MONTHS = (
    u'ledna', u'února', u'března', u'dubna', u'května', u'června',
    u'července', u'srpna', u'září', u'října', u'listopadu', u'prosince',
)
# Month name -> month number (1-12).
MONTHS_MAP = {name: number for number, name in enumerate(MONTHS, 1)}
def parse_date(date_str):
    """Return the posting date described by an ad's contact/location text.

    The text is scanned for ``<day>. <month name>`` pairs; the first pair
    whose month name is a key of ``MONTHS_MAP`` and which forms a valid
    calendar date wins. The site prints no year, so the most recent
    occurrence of that day/month that is not in the future is assumed.

    When no usable date is found, text containing ``dnes`` maps to today's
    date and any other text maps to yesterday's.

    :param date_str: free text taken from the ad.
    :returns: a ``datetime.date``.
    """
    today = datetime.date.today()
    fallback = today if 'dnes' in date_str else today - datetime.timedelta(days=1)

    for found in re.finditer(r'(\d+)\.\s+(\w+)', date_str, re.UNICODE):
        day, month = found.groups()
        month_num = MONTHS_MAP.get(month.lower())
        if month_num is None:
            # "<number>. <word>" that is not a date (unknown month name).
            continue
        # Try this year first so that ads from the current month keep the
        # current year; fall back to last year for dates still ahead of us.
        for year in (today.year, today.year - 1):
            try:
                parsed = datetime.date(year, month_num, int(day))
            except ValueError:
                # Impossible day for that month/year (e.g. 31. or 29. 2.).
                continue
            if parsed <= today:
                return parsed
    return fallback
# Amount directly in front of "Kč": either plain digits or thousands groups
# separated by whitespace or dots ("1 600", "12.000"), optionally followed by
# a decimal part such as ",-" or ",50", which is dropped.
_PRICE_RE = re.compile(
    r'(\d{1,3}(?:[\s.]\d{3})+|\d+)(?:,\S*)?\s*' + u'Kč', re.UNICODE)


def _parse_price_czk(price):
    """Return the whole amount found in front of "Kč" as an int, or None."""
    found = _PRICE_RE.search(price)
    if not found:
        return None
    # Drop the thousands separators before converting.
    return int(re.sub(r'\D', '', found.group(1)))


def parse_doc(resp, days, max_price, **kwargs):
    """Yield the ads found on one search-result page.

    :param resp: response object whose ``content`` holds the page HTML.
    :param days: when truthy, ads whose parsed date is more than this many
        days before today are skipped.
    :param max_price: when truthy, ads without a detectable "Kč" price and
        ads priced above this value are skipped.
    :param kwargs: accepted and ignored, so callers may pass extra options.
    :returns: generator of ``(title, link, text, img, price, loc_data)``
        tuples; ``img`` is ``None`` when the ad has no image link and
        ``price`` is the raw price text (or ``'?'``) when no amount was
        recognised.
    """
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    doc = BeautifulSoup(resp.content, 'html.parser')
    today = datetime.date.today()

    for one in doc.select('td.InzeratBody'):
        contact = one.select('div.InzeratKontakt')
        loc_data = contact[0].text if contact else ''
        if days and (today - parse_date(loc_data)).days > days:
            continue

        headings = one.select('div.InzeratNadpisSmall')
        anchor = headings[0].find('a') if headings else None
        if anchor is None or not anchor.get('href'):
            # Without a title link there is nothing useful to report.
            continue
        title, link = headings[0].text, URL_BASE + anchor['href']

        images = one.select('a.fancybox')
        img = images[0].get('href') if images else None

        texts = one.select('div.InzeratTextSmall')
        text = texts[0].text if texts else ''

        prices = one.select('div.InzeratCenaSmall')
        price = prices[0].text if prices else '?'
        price_czk = _parse_price_czk(price)

        if max_price and (price_czk is None or price_czk > max_price):
            continue

        shown_price = price if price_czk is None else u'%d Kč' % price_czk
        yield title, link, text, img, shown_price, loc_data
def search(term, category='any', ad_type='sell', days=0, max_price=None,
           timeout=30, **kwargs):
    """Search the site for *term* and return an iterator over matching ads.

    The first result page is fetched to read the total hit count
    ("Celkem N"); further pages are then fetched up to a cap of 10 pages.
    Fetching stops early at the first follow-up page that does not return
    an OK status.

    :param term: search phrase; it is URL-encoded by ``requests``.
    :param category: key of ``URL_CATEGORIES``.
    :param ad_type: key of ``AD_TYPES``.
    :param days: forwarded to ``parse_doc``.
    :param max_price: forwarded to ``parse_doc``.
    :param timeout: seconds to wait for each HTTP request.
    :param kwargs: forwarded to ``parse_doc`` unchanged.
    :returns: iterator of the tuples yielded by ``parse_doc`` for each page.
    :raises KeyError: for an unknown ``category`` or ``ad_type``.
    :raises requests.HTTPError: when the first page returns an error status.
    """
    max_pages = 10  # 500 items should be enough
    base_url = URL_BASE + URL_CATEGORIES[category]
    kind = AD_TYPES[ad_type]

    def fetch(page):
        # Same parameters as the URL_SEARCH template, but passed through
        # ``params`` so that the term is escaped properly.
        params = [('is', 1), ('p', page), ('f', term), ('n', kind),
                  ('r', ''), ('i', PER_PAGE), ('o', 'datum')]
        return requests.get(base_url, params=params, timeout=timeout)

    first = fetch(1)
    first.raise_for_status()
    docs = [first]

    # Bytes pattern against bytes content works on both Python 2 and 3.
    found = re.search(br'Celkem\s(\d+)', first.content)
    hits = int(found.group(1)) if found else 0  # no counter: single page
    # Ceiling division, so an exact multiple of PER_PAGE adds no empty page.
    pages = min(max(1, (hits + PER_PAGE - 1) // PER_PAGE), max_pages)

    for page in range(2, pages + 1):
        next_ = fetch(page)
        if next_.status_code != requests.codes.ok:
            break
        docs.append(next_)

    return itertools.chain.from_iterable(
        parse_doc(doc, days=days, max_price=max_price, **kwargs)
        for doc in docs)
def mail(subj, mess, to_, from_='bazarbot@starenka.net'):
    """Send *mess* as a UTF-8 plain-text e-mail via the SMTP server on localhost.

    :param subj: subject line.
    :param mess: message body.
    :param to_: recipient address (a single address).
    :param from_: sender address.

    Errors raised by ``smtplib`` while connecting or sending propagate to
    the caller; once connected, the session is ended with ``quit()`` even
    when sending fails.
    """
    msg = MIMEMultipart('alternative')
    msg['Subject'] = subj
    msg['From'] = from_
    msg['To'] = to_
    msg.attach(MIMEText(mess, 'plain', 'utf-8'))

    server = smtplib.SMTP('localhost')
    try:
        server.sendmail(from_, [to_], msg.as_string().encode('ascii'))
    finally:
        # Do not leave the connection open when sendmail() raises.
        server.quit()
def main():
    """Command-line entry point: search every term and print or mail the hits.

    Each hit is rendered as "title price location", followed by the ad text
    (unless ``--short`` is given) and a line with the ad link and image link.
    With ``--mail`` the listing is sent to that address, otherwise it is
    printed. Nothing is printed or sent when there are no hits.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('terms', nargs="+", help='search terms')
    # parser.add_argument('-c', '--category', default='any', choices=URL_CATEGORIES.keys(),
    #                     help='category to search (defaults to all)')
    # parser.add_argument('-t', '--ad_type', default='sell', choices=AD_TYPES.keys(), help='ad type')
    parser.add_argument('-p', '--max_price', type=int, default=0, help='max price')
    parser.add_argument('-m', '--mail', default=False, help='mail to',)
    parser.add_argument('-s', '--short', action='store_true', default=False, help='short listing',)
    parser.add_argument('-d', '--days', type=int, default=0, help='just ads within [days]')
    args = parser.parse_args()

    chunks = []
    for term in args.terms:
        # Pass only the options search() understands instead of every CLI flag.
        hits = search(term, days=args.days, max_price=args.max_price)
        for title, link, text, img, price, loc_data in hits:
            chunks.append(' '.join((title, price, loc_data)))
            if not args.short:
                chunks.append('\n%s' % text)
            chunks.append('\n%s\n\n' % ' '.join((link, img or '')))
    body = ''.join(chunks)

    if not body:
        return
    if args.mail:
        subj = u'bazarbot "%s"' % ' OR '.join(args.terms)
        if args.max_price:
            subj += u' < %d Kč' % args.max_price
        mail(subj, body, to_=args.mail)
    else:
        print(body)


if __name__ == '__main__':
    main()
@starenka
Copy link
Author

starenka commented Mar 1, 2015

hbazar.py 'dunlop mxr' 'boss' 'whammy' -p1600 -d1 -mstarenka0@gmail.com #mail me any dunlop/boss/whammy hits cheaper than 1600 CZK added yesterday (put it into your crontab)
hbazar.py 'fender jazzmaster' #show me any ads selling a jazzmaster

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment