Skip to content

Instantly share code, notes, and snippets.

@Garciat
Created November 20, 2012 00:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Garciat/4115134 to your computer and use it in GitHub Desktop.
Scraper for cinecenter.com.bo
import re
from urllib2 import Request, urlopen
from bs4 import BeautifulSoup as Soup
class Scraper(object):
regions = dict(
lapaz = 'La Paz',
cochabamba = 'Cochabamba',
santacruz = 'Santa Cruz'
)
base_url = 'http://cinecenter.com.bo'
def __init__(self, region):
if region not in self.regions:
raise ValueError('region does not exist')
self.region = region
self.session = None
def init_session(self):
response = urlopen('%s/%s.html' % (self.base_url, self.region))
headers = dict(response.info())
assert 'set-cookie' in headers, 'no cookie header'
header = headers['set-cookie']
cname, cvalue = header.split(';')[0].strip().split('=')
assert cname.strip().upper() == 'PHPSESSID', 'nope?'
self.session = cvalue
assert self.session is not None, 'session cookie could not be fetched'
def session_request(self, *args, **kwargs):
if self.session is None:
self.init_session()
request = Request(*args, **kwargs)
cookie = 'PHPSESSID=' + self.session
if 'Cookie' in request.headers:
cookie = '%s; %s' % (request.headers['Cookie'], cookie)
request.add_header('Cookie', cookie)
return request
def scrape(self):
request = self.session_request(self.base_url + '/index.php?accion=getCartelera')
response = urlopen(request)
dom = Soup(response.read())
title_re = re.compile('^PELICULA')
for movie in dom('td', class_ = 'over_modul'):
img = movie.find('img', title = title_re)
title = img['title'].split('PELICULA:', 1)[-1].strip()
horarios = movie.find('div', class_ = 'div_hint').get_text().strip().split('\n')
normal = horarios[1]
tresd = horarios[3]
vip = horarios[5]
print title.encode('utf-8')
print 'Normal', normal
print '3D', tresd
print
if __name__ == '__main__':
    # Default to the Santa Cruz listing when run as a script.
    scraper = Scraper('santacruz')
    scraper.scrape()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment