Skip to content

Instantly share code, notes, and snippets.

@mitallast
Created October 2, 2015 12:30
Show Gist options
  • Save mitallast/8dd6f3019c2f5fb1812d to your computer and use it in GitHub Desktop.
Save mitallast/8dd6f3019c2f5fb1812d to your computer and use it in GitHub Desktop.
Parse the guitarplayer.ru commerce.guitars board for PRS topics
# -*- coding: utf-8 -*-
from grab import Grab
import logging
import pickledb
from urlparse import urlparse, parse_qs
# logging.basicConfig(level=logging.DEBUG)
# Scan the listing pages of board 20 on forum.guitarplayer.ru, record every
# topic link in a local pickledb file, and print the topics seen for the first
# time whose title contains a keyword and no blacklisted word.
#
# NOTE(review): the original indentation was lost; the nesting below
# (keyword check applied only to topics not yet stored) is the most plausible
# reading -- confirm against the intended behaviour.

# Second argument is False -- presumably disables pickledb auto-dump, which
# would explain the explicit db.dump() at the end; confirm.
db = pickledb.load('guitarplayer.db', False)

# Matching is by plain substring on the lower-cased title.
keywords = ['prs', 'paul', 'reed', 'smith', u'прс']
blacklist = ['gibson', 'lespaul', 'les', 'ibanez']

g = Grab()

# Offsets 0, 40, ..., 400 are substituted into the "board=20.<offset>" URL.
for offset in range(0, 440, 40):
    g.go("http://forum.guitarplayer.ru/index.php?board=20.%s" % offset)
    for link in g.doc.select('//div/span[@id]/a'):
        url = link.attr('href')
        title = link.text()

        # A matched link without a "topic" query parameter used to raise
        # KeyError and abort the run before db.dump(); skip it instead.
        topic_values = parse_qs(urlparse(url).query).get('topic')
        if not topic_values:
            logging.debug('skipping link without topic id: %s', url)
            continue
        topic = topic_values[0]

        # NOTE(review): relies on db.get() returning None for unknown keys;
        # verify for the pickledb version in use.
        if db.get(topic) is not None:
            continue
        db.set(topic, title)

        # Lower-case once instead of once per keyword/blacklist word.
        lowered = title.lower()
        wanted = any(word in lowered for word in keywords)
        unwanted = any(word in lowered for word in blacklist)
        if wanted and not unwanted:
            print('%s %s' % (url, title))

# Persist the topics recorded during this run.
db.dump()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment