Created
April 9, 2012 22:22
-
-
Save hugorodgerbrown/2347044 to your computer and use it in GitHub Desktop.
Parses event details out from the Hay Festival website
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import json | |
import requests | |
import HTMLParser | |
class FestivalEvent(object):
    """Plain attribute bag describing one festival event.

    The class declares no fields of its own: the parser attaches the
    ``event_*`` attributes to each instance, and the instance ``__dict__``
    is what later gets serialised to JSON.
    """
def cleanText(txt):
    """Return *txt* stripped of surrounding whitespace and typographic marks.

    An en dash (U+2013) becomes a plain hyphen and a right single quotation
    mark (U+2019) becomes a plain apostrophe.
    """
    # (typographic character, plain replacement), applied in this order.
    substitutions = (
        (u'\u2013', '-'),
        (u'\u2019', '\''),
    )
    cleaned = txt.strip()
    for fancy, plain in substitutions:
        cleaned = cleaned.replace(fancy, plain)
    return cleaned
def parsePage(url):
    """Yield a FestivalEvent for each ``<div class="event">`` on the page at *url*.

    The page is downloaded with ``requests.get`` and parsed with
    BeautifulSoup.  For every event div the following attributes are set:

    * ``event_title``    - text of the link inside the div's ``<h2>``
    * ``event_subtitle`` - text of the ``<h3>``, only when one is present
    * ``event_blurb``    - text of the second ``<p>``, only when the div
      has more than one ``<p>``
    * ``event_price``    - text of the ``variantprice`` span, or the
      placeholder "Sold out / Price unavailable" when there is none
    * ``event_url``      - the site root joined with the title link's href
    * ``event_number``, ``event_venue``, ``event_time`` - text of the
      spans carrying the matching CSS class

    Because the subtitle and blurb are optional, they may be absent from
    an event's ``__dict__``.

    If ``HTMLParser.HTMLParseError`` is raised while parsing, a message and
    the error are printed and the generator ends; nothing is re-raised.
    """
    html = requests.get(url).text
    try:
        # NOTE(review): no parser is named here, so bs4 picks one itself.
        soup = BeautifulSoup(html)
        for div in soup.find_all("div", "event"):
            fe = FestivalEvent()
            # HACK - cleanText gets around a bad char in the HTML
            fe.event_title = cleanText(div.h2.a.text)
            if div.h3:
                fe.event_subtitle = cleanText(div.h3.text)

            # Look the paragraphs up once instead of once per use.
            paragraphs = div.find_all("p")
            if len(paragraphs) > 1:
                # Big assumption made here that the second <p> is the blurb!
                fe.event_blurb = cleanText(paragraphs[1].text)

            # The attribute filters are dicts ({"class": name}).  The previous
            # set literals ({"class", name}) were read by bs4 as "class is
            # either 'class' or name", which only worked by accident.
            price_span = div.find("span", {"class": "variantprice"})
            if price_span:
                fe.event_price = cleanText(price_span.text)
            else:
                fe.event_price = "Sold out / Price unavailable"

            fe.event_url = "https://www.hayfestival.com/" + div.h2.a['href']
            fe.event_number = cleanText(div.find("span", {"class": "event-number"}).text)
            fe.event_venue = cleanText(div.find("span", {"class": "event-venue"}).text)
            fe.event_time = cleanText(div.find("span", {"class": "event-time"}).text)
            yield fe
    except HTMLParser.HTMLParseError as e:
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and is also valid Python 3 syntax.
        print("Unable to parse \"" + url + "\"")
        print(e)
# Endpoint that receives every scraped event as a JSON document.
url2 = "http://e2.truffler.net/[...]" #TODO: use your Truffler.net account details

# Walk listing pages 1 to 38, print a one-line summary for each event and
# POST the event's attributes to url2.
for page_number in range(1, 39):
    print("Parsing page " + str(page_number))
    url = "https://www.hayfestival.com/m-57-hay-festival-2012.aspx?pagenum=%s" % str(page_number)
    for event in parsePage(url):
        # Serialise before printing so a serialisation failure stops the
        # event before any output is produced for it.
        event_json = json.dumps(event.__dict__)
        summary_fields = {
            "num": event.event_number,
            "date": event.event_time,
            "title": event.event_title,
            "venue": event.event_venue,
        }
        print("%(num)s: %(date)s :\"%(title)s\" (%(venue)s)" % summary_fields)
        response = requests.post(url2, data=event_json)
        # Any status of 300 or above is reported by dumping the response body.
        if response.status_code >= 300:
            print(response.text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
NB before commenting on this, please bear in mind that it is, by design, a massive hack.