Created
December 13, 2013 21:28
-
-
Save dersphere/7951626 to your computer and use it in GitHub Desktop.
unfinished!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Tristan Fischer (sphere@dersphere.de)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import re | |
from BeautifulSoup import BeautifulSoup | |
from urllib2 import urlopen | |
class CountryURLs(object):
    """Base class holding the URL pieces for one country's site.

    Subclasses set ``country`` and ``base_url`` and may override any of the
    ``*_path`` attributes. Use :meth:`get_urls` to obtain an instance of the
    subclass registered for a given country name.
    """

    country = ''
    base_url = ''
    seasons_path = '/guide/episodes/'
    # Must be a plain string (no trailing comma): it is concatenated to
    # ``base_url`` in ``episodes_url`` and the result still contains the
    # ``%d`` placeholder for the season number.
    episodes_path = '/guide/episodes/season-%d'
    random_path = '/full-episodes/random'

    @classmethod
    def get_urls(cls, country):
        """Return an instance of the direct subclass matching *country*.

        Only direct subclasses of ``cls`` are searched, and the comparison
        is an exact match against each subclass's ``country`` attribute.
        Returns ``None`` when no subclass matches.
        """
        for subcls in cls.__subclasses__():
            if subcls.country == country:
                return subcls()
        return None

    @property
    def seasons_url(self):
        """Absolute URL built from ``base_url`` and ``seasons_path``."""
        return self.base_url + self.seasons_path

    @property
    def episodes_url(self):
        """Absolute URL template built from ``base_url`` and ``episodes_path``.

        The returned string keeps the ``%d`` placeholder from
        ``episodes_path``; format it with a season number to get a real URL.
        """
        return self.base_url + self.episodes_path
class DenmarkURLs(CountryURLs):
    """URL settings registered under the country name 'Denmark'.

    Only the host differs from the base class; all paths are inherited.
    """

    base_url = 'http://www.southparkstudios.dk'
    country = 'Denmark'
class FinlandURLs(CountryURLs):
    """URL settings registered under the country name 'Finland'.

    Only the host differs from the base class; all paths are inherited.
    """

    base_url = 'http://www.southparkstudios.fi'
    country = 'Finland'
class GermanyURLs(CountryURLs):
    """URL settings registered under the country name 'Germany'.

    Unlike the other country classes, this one overrides every path
    attribute of the base class in addition to the host.
    """

    base_url = 'http://www.southpark.de'
    country = 'Germany'

    # Site-specific paths replacing the base-class defaults.
    random_path = '/alle-episoden/random'
    seasons_path = '/guide/episoden/'
    episodes_path = '/guide/episoden/staffel-%d'
class NetherlandURLs(CountryURLs):
    """URL settings registered under the country name 'Netherlands'.

    Only the host differs from the base class; all paths are inherited.
    """

    base_url = 'http://www.southpark.nl'
    country = 'Netherlands'
class NorwayURLs(CountryURLs):
    """URL settings registered under the country name 'Norway'.

    Only the host differs from the base class; all paths are inherited.
    """

    base_url = 'http://www.southparkstudios.no'
    country = 'Norway'
class SwedenURLs(CountryURLs):
    """URL settings registered under the country name 'Sweden'.

    Only the host differs from the base class; all paths are inherited.
    """

    base_url = 'http://www.southparkstudios.se'
    country = 'Sweden'
class UnitedStatesURLs(CountryURLs):
    """URL settings registered under the country name 'UnitedStates'.

    Note that the lookup key has no space. Only the host differs from the
    base class; all paths are inherited.
    """

    base_url = 'http://www.southparkstudios.com'
    country = 'UnitedStates'
class NetworkError(Exception):
    """Raised by SouthparkScraper when fetching a page fails with an HTTP error."""


class SouthparkScraper(object):
    """Scrape season and episode listings from a country-specific site.

    The URLs to fetch come from the ``CountryURLs`` subclass registered for
    the country name given to the constructor.
    """

    def __init__(self, country=None):
        """Look up the URL set for *country*.

        Raises:
            NotImplementedError: if no URL set is registered for *country*.
        """
        self.urls = CountryURLs.get_urls(country)
        if not self.urls:
            raise NotImplementedError

    def get_seasons(self):
        """Yield season numbers (ints) found in the seasons page pagination.

        Yields nothing when the page contains no ``span`` element whose
        class matches ``pagination``.
        """
        url = self.urls.seasons_url
        tree = self.__get_tree(url)
        re_span = re.compile('pagination')
        section = tree.find('span', {'class': re_span})
        if section is None:
            # Without this guard a changed page layout would surface as an
            # opaque AttributeError on ``None.findAll``.
            log('get_seasons found no pagination section')
            return
        for li in section.findAll('li'):
            # Each list item carries the number either in a link or in a
            # plain span; items with neither are ignored.
            if li.a:
                yield int(li.a.string)
            elif li.span:
                yield int(li.span.string)

    def get_episodes(self, season):
        """Yield one dict per episode grid item of the given *season*.

        Each dict has the keys ``'title'``, ``'thumb'`` (image ``src`` with
        any query string removed) and ``'id'``. Grid items lacking the
        expected image div, link or image tag are skipped.
        """
        log('get_episodes started')
        url = self.urls.episodes_url % int(season)
        tree = self.__get_tree(url)
        for li in tree.findAll('li', {'class': 'grid_item'}):
            image_div = li.find('div', {'class': 'image'})
            if image_div is None or image_div.a is None \
                    or image_div.img is None:
                continue
            yield {
                'title': image_div.a.string,
                'thumb': image_div.img['src'].split('?')[0],
                # NOTE(review): 'id' currently duplicates the title text;
                # this looks like a placeholder - confirm the intended value.
                'id': image_div.a.string
            }

    def get_video_url(self, page_url):
        """Not implemented yet; currently always returns ``None``."""
        pass

    def __get_tree(self, url):
        """Fetch *url* and return its parsed BeautifulSoup tree."""
        html = self.__get_url(url)
        return BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    def __get_url(self, url):
        """Return the raw body of *url*.

        Raises:
            NetworkError: if opening the URL fails with an ``HTTPError``.
        """
        # Imported locally because the module-level imports only bring in
        # ``urlopen``; without this the except clause itself would fail with
        # a NameError.
        from urllib2 import HTTPError
        log('__get_url opening url: %s' % url)
        try:
            html = urlopen(url).read()
        except HTTPError as error:
            log('__urlopen HTTPError: %s' % error)
            raise NetworkError('HTTPError: %s' % error)
        log('__get_url got %d bytes' % len(html))
        return html
def log(text):
    """Print *text* to standard output, prefixed with ``'Scraper: '``."""
    # Single-argument call form: prints the same text as the print statement
    # on Python 2 and is also valid syntax on Python 3.
    print(u'Scraper: %s' % text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment