Skip to content

Instantly share code, notes, and snippets.

@hugorodgerbrown
Created April 9, 2012 22:22
Show Gist options
  • Save hugorodgerbrown/2347044 to your computer and use it in GitHub Desktop.
Save hugorodgerbrown/2347044 to your computer and use it in GitHub Desktop.
Parses event details from the Hay Festival website
from bs4 import BeautifulSoup
import json
import requests
import HTMLParser
class FestivalEvent(object):
    """Plain attribute container for one scraped festival event.

    Instances start empty; parsePage() attaches the fields as it finds
    them: event_title, event_price, event_url, event_number, event_venue
    and event_time on every event, plus event_subtitle and event_blurb
    only when the page markup provides them.  The instance ``__dict__``
    is what the driver script serialises to JSON, so no ``__slots__``
    are declared.
    """
def cleanText(txt):
    """Return *txt* stripped of surrounding whitespace with ASCII punctuation.

    Two typographic characters are replaced by their plain equivalents:
    the en dash (U+2013) becomes ``-`` and the right single quotation
    mark (U+2019) becomes ``'``.
    """
    stripped = txt.strip()
    # Swap the en dash first, then the curly apostrophe.
    dashed = stripped.replace(u'\u2013', '-')
    return dashed.replace(u'\u2019', '\'')
def parsePage(url):
    """Fetch a festival listing page and yield a FestivalEvent per event.

    The page at *url* is downloaded with ``requests`` and parsed with
    BeautifulSoup; every ``<div class="event">`` produces one event.

    Args:
        url: address of the listing page to download.

    Yields:
        FestivalEvent with event_title, event_price, event_url,
        event_number, event_venue and event_time set.  event_subtitle is
        set only when the div has an ``<h3>``, and event_blurb only when
        it has at least two ``<p>`` elements.

    An ``HTMLParser.HTMLParseError`` raised while parsing is reported on
    stdout and ends the generator instead of propagating.  A missing
    event-number, event-venue or event-time span still raises
    AttributeError, as before.
    """
    html = requests.get(url).text
    try:
        soup = BeautifulSoup(html)
        for div in soup.find_all("div", "event"):
            fe = FestivalEvent()
            # HACK - gets around bad char in HTML
            fe.event_title = cleanText(div.h2.a.text)
            if div.h3:
                fe.event_subtitle = cleanText(div.h3.text)

            paragraphs = div.find_all("p")
            if len(paragraphs) > 1:
                # Big assumption made here: the second <p> is the blurb!
                fe.event_blurb = cleanText(paragraphs[1].text)

            # attrs must be a dict; the former set literal {"class", "x"}
            # was interpreted as "CSS class is 'class' OR 'x'".
            price_span = div.find("span", {"class": "variantprice"})
            if price_span:
                fe.event_price = cleanText(price_span.text)
            else:
                fe.event_price = "Sold out / Price unavailable"

            fe.event_url = "https://www.hayfestival.com/" + div.h2.a['href']
            fe.event_number = cleanText(
                div.find("span", {"class": "event-number"}).text)
            fe.event_venue = cleanText(
                div.find("span", {"class": "event-venue"}).text)
            fe.event_time = cleanText(
                div.find("span", {"class": "event-time"}).text)
            yield fe
    except HTMLParser.HTMLParseError as e:
        # Single-argument print(...) emits the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Unable to parse \"" + url + "\"")
        print(e)
# Driver: scrape every listing page and POST each event as JSON.
# url2 is the endpoint that receives the serialised events.
url2 = "http://e2.truffler.net/[...]" #TODO: use your Truffler.net account details
# Requests pagenum 1 through 38 of the 2012 programme listing.
for i in range(1, 39):
    print("Parsing page " + str(i))
    url = "https://www.hayfestival.com/m-57-hay-festival-2012.aspx?pagenum=%s" % str(i)
    for event in parsePage(url):
        # The event's attribute dict is the JSON payload.
        event_json = json.dumps(event.__dict__)
        print("%(num)s: %(date)s :\"%(title)s\" (%(venue)s)" % {
            "num": event.event_number,
            "date": event.event_time,
            "title": event.event_title,
            "venue": event.event_venue
        })
        response = requests.post(url2, data=event_json)
        # Any status above 299 is treated as a failure; show the body.
        if response.status_code > 299:
            print(response.text)
@hugorodgerbrown
Copy link
Author

NB: before commenting on this, please bear in mind that it is, by design, a massive hack.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment