# -*- coding: utf-8 -*-
import requests
import bs4


class ScrapingClassBase(object):

    DOMAIN = None  # base URL of the site being scraped
    STARTING_URL = None  # first listing URL (or a list/tuple of URLs) to collect
    ELEMENTS_CONTAINER = None  # CSS selector for each item's container
    ELEMENTS_TO_COLLECT = ()  # per-item extraction rules; see the note below
    LINKS_PAGINATION_SELECT = None  # CSS selector for pagination links
    SENDING_LEAD_FUNCTION = None  # name of the lead-submission function, set by subclasses
    TESTING = False  # set by start(testing=True)

    current_page = None  # page currently being collected
    pages = []  # pages pending collection
    collected_pages = []  # pages already collected
    results = []  # elements collected from the current page

    COURSES_FIELDS = (
        "course_name", "course_type", "course_modality", "course_price",
        "center_name", "center_image", )
    def start(self, testing=False):
        """ This method begins the scraping process.

        :param testing: (optional) when True, sets the TESTING flag.
        """
        if testing:
            self.TESTING = True
        if isinstance(self.STARTING_URL, (list, tuple, )):
            self.current_page = self.STARTING_URL[0]
            for page in self.STARTING_URL:
                self.pages.append(page)
        else:
            self.current_page = self.STARTING_URL
            self.pages.append(self.current_page)
        while self.pages:
            self.collect_next_page()
        return
    def collect_next_page(self):
        """ This method collects elements from the first page in the list,
        saves them in the results property, marks the page as collected and
        finally calls the preproccess_results and proccess_results methods.
        """
        if not self.pages:
            return
        self.current_page = self.pages[0]
        print "Collecting courses page: %s" % self.current_page
        url = self.current_page
        # Build an absolute URL: prepend the domain for relative links and a
        # scheme for protocol-relative ones.
        if self.DOMAIN not in url:
            url = "%s%s" % (self.DOMAIN, url, )
        if "http" not in url:
            url = "http:%s" % url
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, "lxml")
        containers_in_page = [
            element for element in soup.select(self.ELEMENTS_CONTAINER)]
        for container in containers_in_page:
            new_element = {}
            for element in self.ELEMENTS_TO_COLLECT:
                select = container.select(element['select'])
                if not select:
                    continue
                key = element['key']
                if 'target_field' in element.keys():
                    key = element['target_field']
                if element['type'] == 'text':
                    new_element[key] = unicode(select[0].get_text())
                    continue
                if element['type'] == 'attr':
                    new_element[key] = unicode(
                        select[0].attrs.get(element['attr']))
                    continue
            if new_element:
                self.results.append(new_element)
        self.get_new_pages(soup)
        self.pages.remove(self.current_page)
        self.collected_pages.append(self.current_page)
        self.preproccess_results()
        self.proccess_results()
        self.results = []
        return
    def get_new_pages(self, soup):
        """ This method collects new page links from the pagination block.
        """
        if not self.LINKS_PAGINATION_SELECT:
            return
        links = [a.attrs.get('href')
                 for a in soup.select(self.LINKS_PAGINATION_SELECT)]
        for link in links:
            # Skip anchors and pages already queued or collected.
            if (link == "#" or
                    link in self.pages or
                    link in self.collected_pages):
                continue
            self.pages.append(link)
        return
    def preproccess_results(self):
        """ This method is intended to perform changes on a page's results
        before processing them. It is not always needed; when it is, it must
        be overridden in descendant classes.
        """
        pass

    def proccess_results(self):
        """ This method is in charge of processing the scraped results:
        saving them to a database, running analytics, etc. It must be
        overridden in descendant classes.
        """
        pass
# ==============================================================================
# Use example:
#
# class ScrapingEmagisterBase(ScrapingClassBase):
#     DOMAIN = 'http://www.emagister.com'
#     ELEMENTS_CONTAINER = 'div.items-landings-wrp div.item-landing-wrp'
#     ELEMENTS_TO_COLLECT = (
#         {'select': 'div.item-landing-data h2 a', 'type': 'attr',
#          'attr': 'title', 'key': 'course_name',
#          'target_field': 'course_name', },
#         {'select': 'div.price-box span.price', 'type': 'text',
#          'key': 'course_price', 'target_field': 'course_price', },
#         {'select': 'div.item-landing-data span.item-center', 'type': 'text',
#          'key': 'center_name', 'target_field': 'center_name', },
#         {'select': 'div.item-landing-logo a img', 'type': 'attr',
#          'attr': 'src', 'key': 'center_image',
#          'target_field': 'center_image', },
#         {'select':
#          'div.item-landing-block-top-right ul.course-item-landing-features '
#          'li.icons-label-black-tiny-before',
#          'type': 'text', 'key': 'Tipo', },
#         {'select':
#          'div.item-landing-block-top-right ul.course-item-landing-features '
#          'li.icons-check-black-tiny-before',
#          'type': 'text', 'key': 'Nivel', },
#         {'select':
#          'div.item-landing-block-top-right ul.course-item-landing-features '
#          'li.icons-map-pointer-black-tiny-before',
#          'type': 'text', 'key': 'Lugar', },
#         {'select':
#          'div.item-landing-block-top-right ul.course-item-landing-features '
#          'li.icons-clock-black-tiny-before',
#          'type': 'text', 'key': 'Horas lectivas', },
#         {'select':
#          'div.item-landing-block-top-right ul.course-item-landing-features '
#          'li.icons-calendar-black-tiny-before',
#          'type': 'text', 'key': 'Comienzo', },
#     )
#     LINKS_PAGINATION_SELECT = 'nav.app_search_paginationPlaceholder li a'
#     SENDING_LEAD_FUNCTION = 'submit_emagister_lead'
#
#     def preproccess_results(self):
#         for item in self.results:
#             for key in item.keys():
#                 if isinstance(item[key], (str, unicode, )):
#                     item[key] = item[key].strip().replace("\n", "")
#
#             if "center_image" not in item.keys():
#                 item["center_image"] = ""
#             if "data:" in item["center_image"]:
#                 item["center_image"] = ""
#             if item["center_image"] and "http" not in item["center_image"]:
#                 item["center_image"] = "http:%s" % item["center_image"]
#         return
#
#     def proccess_results(self):
#         for item in self.results:
#             course_info = {
#                 'course_name': item.get("course_name", None),
#                 'course_type': item.get("course_type", None),
#                 'course_modality': item.get("course_modality", None),
#                 'course_price': item.get("course_price", None),
#                 'university_name': item.get("center_name", None),
#                 'university_image': item.get("center_image", None), }
#
#             api.save_course(**course_info)
#
#         print "Pending: %d pages" % len(self.pages)
#         return
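#
# A minimal driver for the example above might look like this (a sketch: it
# assumes an `api` module providing `save_course` is importable, and the
# STARTING_URL below is a hypothetical placeholder, not a real listing path):
#
#     class ScrapingEmagisterCourses(ScrapingEmagisterBase):
#         STARTING_URL = '/cursos-online/'  # hypothetical listing path
#
#     ScrapingEmagisterCourses().start(testing=True)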