Skip to content

Instantly share code, notes, and snippets.

@dersphere
Created December 13, 2013 21:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dersphere/7951626 to your computer and use it in GitHub Desktop.
Save dersphere/7951626 to your computer and use it in GitHub Desktop.
unfinished!
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Tristan Fischer (sphere@dersphere.de)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import re
from urllib2 import HTTPError
from urllib2 import urlopen

from BeautifulSoup import BeautifulSoup
class CountryURLs(object):
    """Base class describing the URL layout of one country's site.

    Subclasses set ``country`` and ``base_url`` and may override any of the
    ``*_path`` attributes. ``get_urls`` picks the subclass by country name.
    """

    country = ''
    base_url = ''
    seasons_path = '/guide/episodes/'
    # Must be a plain string: it is concatenated onto ``base_url`` and the
    # result is %-formatted with a season number by the caller. A trailing
    # comma here would silently turn it into a 1-tuple.
    episodes_path = '/guide/episodes/season-%d'
    random_path = '/full-episodes/random'

    @classmethod
    def get_urls(cls, country):
        """Return an instance of the direct subclass matching ``country``.

        Only direct subclasses of ``cls`` are considered. Returns ``None``
        when no subclass declares the requested country.
        """
        for subcls in cls.__subclasses__():
            if subcls.country == country:
                return subcls()
        return None

    @property
    def seasons_url(self):
        """Return ``base_url`` joined with ``seasons_path``."""
        return self.base_url + self.seasons_path

    @property
    def episodes_url(self):
        """Return ``base_url`` joined with ``episodes_path``.

        The result still contains the ``%d`` placeholder for the season
        number.
        """
        return self.base_url + self.episodes_path
class DenmarkURLs(CountryURLs):
    """URL set selected by the country name 'Denmark'; inherits all paths."""

    base_url = 'http://www.southparkstudios.dk'
    country = 'Denmark'
class FinlandURLs(CountryURLs):
    """URL set selected by the country name 'Finland'; inherits all paths."""

    base_url = 'http://www.southparkstudios.fi'
    country = 'Finland'
class GermanyURLs(CountryURLs):
    """URL set selected by the country name 'Germany'.

    Overrides all three path attributes of the base class.
    """

    base_url = 'http://www.southpark.de'
    country = 'Germany'

    # Paths that replace the base-class defaults for this site.
    random_path = '/alle-episoden/random'
    seasons_path = '/guide/episoden/'
    episodes_path = '/guide/episoden/staffel-%d'
class NetherlandURLs(CountryURLs):
    """URL set selected by the country name 'Netherlands'; inherits all paths."""

    base_url = 'http://www.southpark.nl'
    country = 'Netherlands'
class NorwayURLs(CountryURLs):
    """URL set selected by the country name 'Norway'; inherits all paths."""

    base_url = 'http://www.southparkstudios.no'
    country = 'Norway'
class SwedenURLs(CountryURLs):
    """URL set selected by the country name 'Sweden'; inherits all paths."""

    base_url = 'http://www.southparkstudios.se'
    country = 'Sweden'
class UnitedStatesURLs(CountryURLs):
    """URL set selected by the country name 'UnitedStates'; inherits all paths."""

    base_url = 'http://www.southparkstudios.com'
    country = 'UnitedStates'
class NetworkError(Exception):
    """Raised by SouthparkScraper when fetching a page fails with an HTTPError."""


class SouthparkScraper(object):
    """Scrape season numbers and episode listings from a country's site.

    The site layout is taken from the ``CountryURLs`` subclass that matches
    the country name passed to the constructor.
    """

    def __init__(self, country=None):
        """Select the URL set for ``country``.

        Raises:
            NotImplementedError: no ``CountryURLs`` subclass declares
                ``country`` (this includes the default ``None``).
        """
        self.urls = CountryURLs.get_urls(country)
        if not self.urls:
            raise NotImplementedError(
                'No URLs defined for country: %r' % (country,)
            )

    def get_seasons(self):
        """Yield season numbers as ints, parsed from the seasons page.

        Looks up the first ``<span>`` whose class matches 'pagination' and
        reads each ``<li>`` in it: the text of its ``<a>`` if present,
        otherwise the text of its ``<span>``. ``<li>`` elements with neither
        are skipped.
        """
        url = self.urls.seasons_url
        tree = self.__get_tree(url)
        re_span = re.compile('pagination')
        section = tree.find('span', {'class': re_span})
        for li in section.findAll('li'):
            if li.a:
                yield int(li.a.string)
            elif li.span:
                yield int(li.span.string)

    def get_episodes(self, season):
        """Yield one dict per episode found on the page for ``season``.

        ``season`` is converted with ``int()`` and substituted into the
        episodes URL template. Each yielded dict has the keys 'title',
        'thumb' (image ``src`` with any query string removed) and 'id'.
        """
        log('get_episodes started')
        url = self.urls.episodes_url % int(season)
        tree = self.__get_tree(url)
        for li in tree.findAll('li', {'class': 'grid_item'}):
            image_div = li.find('div', {'class': 'image'})
            yield {
                'title': image_div.a.string,
                'thumb': image_div.img['src'].split('?')[0],
                # NOTE(review): 'id' currently duplicates 'title'; presumably
                # a real identifier was intended - confirm before relying on it.
                'id': image_div.a.string
            }

    def get_video_url(self, page_url):
        """Not implemented yet; always returns ``None``."""
        pass

    def __get_tree(self, url):
        """Fetch ``url`` and return it parsed as a BeautifulSoup tree."""
        html = self.__get_url(url)
        return BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    def __get_url(self, url):
        """Return the raw body of ``url``.

        Raises:
            NetworkError: ``urlopen`` raised an ``HTTPError``.
        """
        log('__get_url opening url: %s' % url)
        try:
            html = urlopen(url).read()
        except HTTPError as error:
            log('__urlopen HTTPError: %s' % error)
            raise NetworkError('HTTPError: %s' % error)
        log('__get_url got %d bytes' % len(html))
        return html
def log(text):
    """Print ``text`` to stdout prefixed with 'Scraper: '.

    Uses the single-argument call form of ``print`` so the line parses both
    as a Python 2 print statement and as a Python 3 function call.
    """
    print(u'Scraper: %s' % text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment