Created
October 11, 2016 11:34
-
-
Save ryllada/eabe23aa2092f3c16d0865640b39359b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import requests | |
import bs4 | |
class ScrapingClassBase(object):
    """Generic page scraper built on requests + BeautifulSoup.

    Subclasses configure the class-level constants (DOMAIN, STARTING_URL,
    ELEMENTS_CONTAINER, ELEMENTS_TO_COLLECT, LINKS_PAGINATION_SELECT) and
    override preproccess_results() / proccess_results() to clean up and
    persist the scraped items.  See the usage example at the bottom of
    this file.
    """

    DOMAIN = None                   # e.g. 'http://www.example.com'
    STARTING_URL = None             # a single URL, or a list/tuple of URLs
    ELEMENTS_CONTAINER = None       # CSS selector matching each item container
    ELEMENTS_TO_COLLECT = ()        # dicts: {'select', 'type', 'key', ...}
    LINKS_PAGINATION_SELECT = None  # CSS selector for pagination <a> tags
    SENDING_LEAD_FUNCTION = None    # presumably a hook name used elsewhere — not read here
    TESTING = False

    COURSES_FIELDS = (
        "course_name", "course_type", "course_modality", "course_price",
        "center_name", "center_image", )

    # Python 2/3 compatibility: the original used the py2-only builtin
    # `unicode`; fall back to `str` on Python 3.
    try:
        _text_type = unicode  # noqa: F821 -- exists on Python 2 only
    except NameError:
        _text_type = str

    # Class-level defaults kept for backward compatibility with any code
    # that reads them off the class itself.
    current_page = None
    pages = []
    collected_pages = []
    results = []

    def __init__(self):
        # BUGFIX: these used to be *class* attributes only, so every
        # instance shared (and permanently accumulated) the same lists.
        # They are now per-instance state.
        self.current_page = None
        self.pages = []            # URLs still pending collection
        self.collected_pages = []  # URLs already scraped
        self.results = []          # items scraped from the current page

    def start(self, testing=False):
        """Begin the scraping process.

        :param testing: when True, sets self.TESTING for subclasses to use.
        """
        if testing:
            self.TESTING = True
        if isinstance(self.STARTING_URL, (list, tuple, )):
            self.current_page = self.STARTING_URL[0]
            self.pages.extend(self.STARTING_URL)
        else:
            self.current_page = self.STARTING_URL
            self.pages.append(self.current_page)
        while self.pages:
            self.collect_next_page()
        return

    def _absolute_url(self, url):
        """Return `url` made absolute against DOMAIN, with an http scheme."""
        # Prepend the domain for relative links.  Guard against DOMAIN=None,
        # which used to raise TypeError on the `in` test.
        if self.DOMAIN and self.DOMAIN not in url:
            url = "%s%s" % (self.DOMAIN, url, )
        # BUGFIX: the original tested `"http" not in url`, a substring test
        # that wrongly accepted any URL merely *containing* "http"
        # (e.g. ".../httpdocs/x").  Scheme-relative "//host/..." links are
        # completed here.
        if not url.startswith("http"):
            url = "http:%s" % url
        return url

    def collect_next_page(self):
        """Collect elements from the first page in the pending list.

        Saves the scraped items in self.results, marks the page as
        collected, and finally calls the preproccess_results and
        proccess_results hooks (names kept misspelled: subclasses
        override them).
        """
        if not self.pages:
            return
        self.current_page = self.pages[0]
        print("Collecting courses page: %s" % self.current_page)
        url = self._absolute_url(self.current_page)
        # Timeout added so a dead server cannot hang the whole run.
        response = requests.get(url, timeout=30)
        soup = bs4.BeautifulSoup(response.text, "lxml")
        for container in soup.select(self.ELEMENTS_CONTAINER):
            new_element = {}
            for element in self.ELEMENTS_TO_COLLECT:
                select = container.select(element['select'])
                if not select:
                    continue
                # 'target_field', when present, overrides 'key' as the
                # result-dict key.
                key = element.get('target_field', element['key'])
                if element['type'] == 'text':
                    new_element[key] = self._text_type(select[0].get_text())
                elif element['type'] == 'attr':
                    new_element[key] = self._text_type(
                        select[0].attrs.get(element['attr']))
            if new_element:
                self.results.append(new_element)
        self.get_new_pages(soup)
        self.pages.remove(self.current_page)
        self.collected_pages.append(self.current_page)
        self.preproccess_results()
        self.proccess_results()
        self.results = []
        return

    def get_new_pages(self, soup):
        """Queue pagination links found in `soup` that are not yet known."""
        if not self.LINKS_PAGINATION_SELECT:
            return
        links = [a.attrs.get('href')
                 for a in soup.select(self.LINKS_PAGINATION_SELECT)]
        for link in links:
            # BUGFIX: also skip None hrefs (<a> tags without an href
            # attribute), which the original queued and later crashed on.
            if (not link or link == "#" or
                    link in self.pages or
                    link in self.collected_pages):
                continue
            self.pages.append(link)
        return

    def preproccess_results(self):
        """Hook to transform a page's results before processing them.

        Sometimes it is needed, sometimes not.  When needed, override it
        in descendant classes.  (Name kept misspelled — 'preprocess' —
        for backward compatibility with existing subclasses.)
        """
        pass

    def proccess_results(self):
        """Hook in charge of processing scraped results: saving to a
        database, analytics, etc.  Override in descendant classes.
        (Name kept misspelled — 'process' — for backward compatibility
        with existing subclasses.)
        """
        pass
# ==============================================================================
# Use example:
# class ScrapingEmagisterBase(ScrapingClassBase):
# DOMAIN = 'http://www.emagister.com'
# ELEMENTS_CONTAINER = 'div.items-landings-wrp div.item-landing-wrp'
# ELEMENTS_TO_COLLECT = (
# {'select': 'div.item-landing-data h2 a', 'type': 'attr',
# 'attr': 'title', 'key': 'course_name',
# 'target_field': 'course_name', },
# {'select': 'div.price-box span.price', 'type': 'text',
# 'key': 'course_price', 'target_field': 'course_price', },
# {'select': 'div.item-landing-data span.item-center', 'type': 'text',
# 'key': 'center_name', 'target_field': 'center_name', },
# {'select': 'div.item-landing-logo a img', 'type': 'attr',
# 'attr': 'src', 'key': 'center_image',
# 'target_field': 'center_image', },
# {'select':
# 'div.item-landing-block-top-right ul.course-item-landing-features '
# 'li.icons-label-black-tiny-before',
# 'type': 'text', 'key': 'Tipo', },
# {'select':
# 'div.item-landing-block-top-right ul.course-item-landing-features '
# 'li.icons-check-black-tiny-before',
# 'type': 'text', 'key': 'Nivel', },
# {'select':
# 'div.item-landing-block-top-right ul.course-item-landing-features '
# 'li.icons-map-pointer-black-tiny-before',
# 'type': 'text', 'key': 'Lugar', },
# {'select':
# 'div.item-landing-block-top-right ul.course-item-landing-features '
# 'li.icons-clock-black-tiny-before',
# 'type': 'text', 'key': 'Horas lectivas', },
# {'select':
# 'div.item-landing-block-top-right ul.course-item-landing-features '
# 'li.icons-calendar-black-tiny-before',
# 'type': 'text', 'key': 'Comienzo', },
# )
# LINKS_PAGINATION_SELECT = 'nav.app_search_paginationPlaceholder li a'
# SENDING_LEAD_FUNCTION = 'submit_emagister_lead'
#
# def preproccess_results(self):
# for item in self.results:
# for key in item.keys():
# if isinstance(item[key], (str, unicode, )):
# item[key] = item[key].strip().replace("\n", "")
#
# if "center_image" not in item.keys():
# item["center_image"] = ""
# if "data:" in item["center_image"]:
# item["center_image"] = ""
# if item["center_image"] and "http" not in item["center_image"]:
# item["center_image"] = "http:%s" % item["center_image"]
# return
#
# def proccess_results(self):
# for item in self.results:
# course_info = {
# 'course_name': item.get("course_name", None),
# 'course_type': item.get("course_type", None),
# 'course_modality': item.get("course_modality", None),
# 'course_price': item.get("course_price", None),
# 'university_name': item.get("center_name", None),
# 'university_image': item.get("center_image", None), }
#
# api.save_course(**course_info)
#
# print "Pending: %d pages" % len(self.pages)
# return
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment