Skip to content

Instantly share code, notes, and snippets.

@matiit
Last active December 16, 2015 12:29
Show Gist options
  • Save matiit/5435316 to your computer and use it in GitHub Desktop.
Save matiit/5435316 to your computer and use it in GitHub Desktop.
Nice using of exceptions to "break loop more than one level up"
#-*- coding: utf-8 -*-
from pyquery import PyQuery as pq
import MySQLdb
db = MySQLdb.connect(host="localhost", user="someuser",
passwd="somepassword", db="somedatabase", charset='utf8')
c = db.cursor()
# It explains itself
# Of course we can dig it from forum "programmaticaly",
# but who cares writing some digits? It looks so old school, doesn't it?
forum_ids = [11,113,122,22,84,13,66,109,38,93,92,137,94,20,126,97,98,47,40,\
121,28,107,110,116,163,150,166,19,69,101,102,18,22,16,26,50]
for forum_id in forum_ids:
try:
print "parsing", forum_ids.index(forum_id)+1, "of", len(forum_ids)
baseUrl = 'http://someVbulletinForum.com/forumdisplay.php?f='+str(forum_id)+'&sort=views&order=desc'+'&page=1'
html = pq(url=baseUrl)
tmp = html('.pagenav td.vbmenu_control')[0].text_content()
#pages = int(tmp[tmp.find('f')+2:].strip())
pages = int(html('.pagenav td.vbmenu_control')[0].text_content().split('z')[-1].strip())
for page in range(1,pages):
print "\t\tPage", page, "of", pages
uri = baseUrl+str(forum_id)+'&sort=views&order=desc&page='+str(page)
h=pq(url=uri)
h = h('table#threadslist tr')
for tr in h:
tds = tr.getchildren()
if len(tds) != 6:
# It's a some kind of "header of table"
continue
viewTd = tds[5]
try:
# I'm sure that nicer way of doing it is out there :)
views = int(viewTd.text_content().encode('iso8859-2').replace('\xa0',''))
except ValueError:
continue
# Some condition
if views >= 500:
replyTd = tds[4]
try:
reply = int(replyTd.text_content().encode('iso8859-2').replace('\xa0',''))
except:
# Table cell can be corrupted (it happens) or there can be "-" instead of some int
continue
# Another condition
if reply <= 10:
titleTd = tds[2]
# I like that :)
titleDirtyText = titleTd.cssselect("a[id*='thread_title_']")[0].text_content()
title = titleDirtyText.encode('UTF-8')
data = {'title': MySQLdb.escape_string(title), 'views': views, 'reply': reply}
# By the way - nice bindings params in python
c.execute("""
INSERT INTO topics(name, views, replies) VALUES (%(title)s, '%(views)s', '%(reply)s');
""", data)
# Not sure if necessary
db.commit()
# It's just for "more hack-alike" style in console
print title, views, reply
else:
# Almost like break(2) but much cleaner -> you immediately see where it will go...
raise StopIteration
# ...I mean here :)
except StopIteration:
continue
# Let's hope that will be executed (if it's a big forum and poor conditions, you can not have that much of patience).
c.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment