Skip to content

Instantly share code, notes, and snippets.

@bkeating
Created August 12, 2010 00:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bkeating/520107 to your computer and use it in GitHub Desktop.
Save bkeating/520107 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import datetime as dt
import dateutil.parser as dparser
import urllib2
from BeautifulSoup import BeautifulSoup
print "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
print "<articles>"
# Loop through each prediction and bet URL.
for i in range(1,571):
url = "http://www.longbets.org/" + str(i)
page = urllib2.urlopen(url)
soup = BeautifulSoup(page)
# We check the title tag to see if we should continue down the page.
pageTitle = soup.html.head.title.renderContents().strip()
if pageTitle == "Long Bets - Bet Not Found":
pass
else:
print " <article>"
print " <url>%s</url>" % (url)
print " <comments>"
# Now we dive into each comment on the page and grab what we need.
for comment in soup.findAll('div', 'post'):
# The last 'post' on every page is not really a comment. Exclude it.
if not comment.h3.string.strip() == "Comments are temporarily closed.":
name = comment.div.a.renderContents().strip()
user_url = "http://longbets.org" + comment.div.a['href']
title = comment.h3.renderContents().strip()
message = title + " " + comment.p.renderContents().strip()
date_str = str(comment.div.contents[5].string)
date = dparser.parse(date_str)
date = date.strftime('%a, %d %b %Y %H:%M:%S')
print " <comment>"
print " <name>%s</name>" % (name)
print " <email></email>"
print " <url>" + user_url + "</url>"
print " <ip_address></ip_address>"
print " <message>%s</message>" % (message)
print " <date>%s -0000</date>" % (date)
print " <points>1</points>"
print " </comment>"
print " </comments>"
print " </article>"
print "</articles>"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment