Created
October 2, 2015 12:30
-
-
Save mitallast/8dd6f3019c2f5fb1812d to your computer and use it in GitHub Desktop.
Parse the guitarplayer.ru commerce.guitars board for PRS topics
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Scan board 20 of forum.guitarplayer.ru for topics that mention PRS guitars.

Every topic id found on the listing pages is stored in a local pickledb file
(``guitarplayer.db``) together with its title. A topic is printed only the
first time it is seen, and only when its title contains at least one keyword
and none of the blacklisted words.
"""
import logging

from grab import Grab
import pickledb

try:  # Python 2, which this script was originally written for
    from urlparse import urlparse, parse_qs
except ImportError:  # Python 3 provides the same helpers in urllib.parse
    from urllib.parse import urlparse, parse_qs

# Uncomment to enable debug-level logging.
# logging.basicConfig(level=logging.DEBUG)

# NOTE(review): the second argument is presumably pickledb's auto-dump flag;
# with it off, nothing is persisted until db.dump() is called explicitly.
db = pickledb.load('guitarplayer.db', False)

# Substrings that make a topic title interesting / uninteresting.
keywords = ['prs', 'paul', 'reed', 'smith', u'прс']
blacklist = ['gibson', 'lespaul', 'les', 'ibanez']

g = Grab()

try:
    # Listing pages are addressed by offset: 0, 40, ..., 400.
    for offset in range(0, 440, 40):
        g.go("http://forum.guitarplayer.ru/index.php?board=20.%s" % offset)
        for link in g.doc.select('//div/span[@id]/a'):
            url = link.attr('href')
            title = link.text()

            # Skip links without a ``topic`` query parameter instead of
            # aborting the whole scan with a KeyError.
            topic_values = parse_qs(urlparse(url).query).get('topic')
            if not topic_values:
                continue
            topic = topic_values[0]

            # Only topics that are not yet in the database are reported.
            if db.get(topic) is None:
                db.set(topic, title)
                lowered = title.lower()  # computed once for both word lists
                if (any(word in lowered for word in keywords)
                        and not any(word in lowered for word in blacklist)):
                    print('%s %s' % (url, title))
finally:
    # Persist what was recorded even if a request or parse step failed, so
    # topics already printed are not reported again on the next run.
    db.dump()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment