Skip to content

Instantly share code, notes, and snippets.

@iserko
Created March 14, 2014 16:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iserko/9551817 to your computer and use it in GitHub Desktop.
Save iserko/9551817 to your computer and use it in GitHub Desktop.
A cute little Royal Opera House website parser using Selenium
#!/usr/bin/env python
from email.mime.text import MIMEText
import difflib
import io
import os.path
import random
import smtplib
import time

from pyvirtualdisplay.smartdisplay import SmartDisplay
from selenium import webdriver
def parse_roh():
    """Check Royal Opera House ticket pages and email any changes found.

    For each hard-coded event, the seat-selection page is loaded in Chrome
    running on an xvfb-backed ``SmartDisplay``. The heading texts (``h1``,
    ``h2``, ``h3``) of every listed section are collected as name, price and
    status, and the combined text is compared with the snapshot stored in
    ``roh_status_<event_id>.txt`` (relative to the current working
    directory). When the text differs, the snapshot is overwritten and an
    ``difflib.ndiff`` of old vs. new is queued for the report. If anything
    was queued, a single plain-text UTF-8 email is sent through the SMTP
    server on ``localhost``.

    Takes no arguments and returns ``None``. Exceptions raised by Selenium,
    file I/O or SMTP propagate to the caller, but the browser, the virtual
    display and the SMTP connection are always shut down first.
    """
    from_addr = 'root@roh-parser.example.com'
    to_addr = 'igor.serko@gmail.com'
    event_ids = [
        ('19766', u"La Boheme on Sat 9th March"),
        ('20351', u'Tosca on Sat 16th March'),
        ('20353', u'Tosca on Sat 23rd March'),
    ]
    output = []
    num_diffs = 0

    # Optional random start delay (disabled):
    # time.sleep(random.randint(1, 60))
    display = SmartDisplay(visible=0, size=(1024, 768), bgcolor='white', backend='xvfb')
    display.start()
    try:
        browser = webdriver.Chrome('/usr/bin/chromedriver')
        try:
            for event_id, event_name in event_ids:
                # Sleep a random 2-10 seconds between page loads.
                time.sleep(random.randint(2, 10))
                url = u'http://www.roh.org.uk/events/%s/tickets/syos' % event_id
                browser.get(url)
                # Optional debugging screenshot (disabled):
                # img = display.waitgrab()
                # img.save('img_%s_%s.jpeg' % (event_id, int(time.time())), 'JPEG')
                sections = browser.find_elements_by_xpath(
                    "//li[@class='panel-section on']/div[@class='sections']/ul/li")
                lines = []
                for section in sections:
                    section_name = section.find_element_by_tag_name('h1').text
                    prices = section.find_element_by_tag_name('h2').text
                    status = section.find_element_by_tag_name('h3').text
                    lines.append(
                        u"Section: %s, Price: %s, Status: %s ... URL=%s"
                        % (section_name, prices, status, url))
                full_stat = u'\n'.join(lines)

                # io.open with an explicit encoding reads/writes unicode text
                # on both Python 2 and 3; newline='' disables newline
                # translation so the stored snapshot round-trips unchanged.
                old_stat = u''
                fname = 'roh_status_%s.txt' % event_id
                if os.path.exists(fname):
                    with io.open(fname, 'r', encoding='utf-8', newline='') as f:
                        old_stat = f.read()

                if full_stat != old_stat:
                    with io.open(fname, 'w', encoding='utf-8', newline='') as f:
                        f.write(full_stat)
                    num_diffs += 1
                    output.append(u"Found differences for event: %s" % event_name)
                    output.extend(
                        difflib.ndiff(old_stat.splitlines(), full_stat.splitlines()))
                    # Two blank lines separate the reports of successive events.
                    output.append(u'')
                    output.append(u'')
        finally:
            # Always terminate Chrome, even if a page load or scrape failed.
            browser.quit()
    finally:
        # Always tear down the virtual display so no Xvfb process is leaked.
        display.stop()

    if output:
        msg = MIMEText((u'\n'.join(output)).encode('utf-8'), 'plain', 'UTF-8')
        msg['Subject'] = 'Royal Opera House - Found %s differences' % num_diffs
        msg['From'] = from_addr
        msg['To'] = to_addr
        s = smtplib.SMTP('localhost')
        try:
            s.sendmail(from_addr, [to_addr], msg.as_string())
        finally:
            s.quit()
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    parse_roh()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment