Last active
December 16, 2015 12:29
-
-
Save matiit/5435316 to your computer and use it in GitHub Desktop.
Nice use of exceptions to "break out of a loop more than one level up"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: utf-8 -*- | |
from pyquery import PyQuery as pq | |
import MySQLdb | |
db = MySQLdb.connect(host="localhost", user="someuser", | |
passwd="somepassword", db="somedatabase", charset='utf8') | |
c = db.cursor() | |
# It explains itself | |
# Of course we can dig it from forum "programmaticaly", | |
# but who cares writing some digits? It looks so old school, doesn't it? | |
forum_ids = [11,113,122,22,84,13,66,109,38,93,92,137,94,20,126,97,98,47,40,\ | |
121,28,107,110,116,163,150,166,19,69,101,102,18,22,16,26,50] | |
for forum_id in forum_ids: | |
try: | |
print "parsing", forum_ids.index(forum_id)+1, "of", len(forum_ids) | |
baseUrl = 'http://someVbulletinForum.com/forumdisplay.php?f='+str(forum_id)+'&sort=views&order=desc'+'&page=1' | |
html = pq(url=baseUrl) | |
tmp = html('.pagenav td.vbmenu_control')[0].text_content() | |
#pages = int(tmp[tmp.find('f')+2:].strip()) | |
pages = int(html('.pagenav td.vbmenu_control')[0].text_content().split('z')[-1].strip()) | |
for page in range(1,pages): | |
print "\t\tPage", page, "of", pages | |
uri = baseUrl+str(forum_id)+'&sort=views&order=desc&page='+str(page) | |
h=pq(url=uri) | |
h = h('table#threadslist tr') | |
for tr in h: | |
tds = tr.getchildren() | |
if len(tds) != 6: | |
# It's a some kind of "header of table" | |
continue | |
viewTd = tds[5] | |
try: | |
# I'm sure that nicer way of doing it is out there :) | |
views = int(viewTd.text_content().encode('iso8859-2').replace('\xa0','')) | |
except ValueError: | |
continue | |
# Some condition | |
if views >= 500: | |
replyTd = tds[4] | |
try: | |
reply = int(replyTd.text_content().encode('iso8859-2').replace('\xa0','')) | |
except: | |
# Table cell can be corrupted (it happens) or there can be "-" instead of some int | |
continue | |
# Another condition | |
if reply <= 10: | |
titleTd = tds[2] | |
# I like that :) | |
titleDirtyText = titleTd.cssselect("a[id*='thread_title_']")[0].text_content() | |
title = titleDirtyText.encode('UTF-8') | |
data = {'title': MySQLdb.escape_string(title), 'views': views, 'reply': reply} | |
# By the way - nice bindings params in python | |
c.execute(""" | |
INSERT INTO topics(name, views, replies) VALUES (%(title)s, '%(views)s', '%(reply)s'); | |
""", data) | |
# Not sure if necessary | |
db.commit() | |
# It's just for "more hack-alike" style in console | |
print title, views, reply | |
else: | |
# Almost like break(2) but much cleaner -> you immediately see where it will go... | |
raise StopIteration | |
# ...I mean here :) | |
except StopIteration: | |
continue | |
# Let's hope that will be executed (if it's a big forum and poor conditions, you can not have that much of patience). | |
c.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment