Created
October 11, 2016 11:34
-
-
Save ryllada/eabe23aa2092f3c16d0865640b39359b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import requests | |
import bs4 | |
class ScrapingClassBase(object):
    """Generic page scraper built on requests + BeautifulSoup.

    Subclasses configure the class-level constants (DOMAIN, STARTING_URL,
    ELEMENTS_CONTAINER, ELEMENTS_TO_COLLECT, LINKS_PAGINATION_SELECT) and
    override preproccess_results() / proccess_results() to clean up and
    persist the scraped items.  See the usage example at the bottom of
    this file.
    """

    DOMAIN = None                   # e.g. 'http://www.example.com'
    STARTING_URL = None             # a single URL, or a list/tuple of URLs
    ELEMENTS_CONTAINER = None       # CSS selector matching each item container
    ELEMENTS_TO_COLLECT = ()        # dicts: {'select', 'type', 'key', ...}
    LINKS_PAGINATION_SELECT = None  # CSS selector for pagination <a> tags
    SENDING_LEAD_FUNCTION = None    # presumably a hook name used elsewhere — not read here
    TESTING = False

    COURSES_FIELDS = (
        "course_name", "course_type", "course_modality", "course_price",
        "center_name", "center_image", )

    # Python 2/3 compatibility: the original used the py2-only builtin
    # `unicode`; fall back to `str` on Python 3.
    try:
        _text_type = unicode  # noqa: F821 -- exists on Python 2 only
    except NameError:
        _text_type = str

    # Class-level defaults kept for backward compatibility with any code
    # that reads them off the class itself.
    current_page = None
    pages = []
    collected_pages = []
    results = []

    def __init__(self):
        # BUGFIX: these used to be *class* attributes only, so every
        # instance shared (and permanently accumulated) the same lists.
        # They are now per-instance state.
        self.current_page = None
        self.pages = []            # URLs still pending collection
        self.collected_pages = []  # URLs already scraped
        self.results = []          # items scraped from the current page

    def start(self, testing=False):
        """Begin the scraping process.

        :param testing: when True, sets self.TESTING for subclasses to use.
        """
        if testing:
            self.TESTING = True
        if isinstance(self.STARTING_URL, (list, tuple, )):
            self.current_page = self.STARTING_URL[0]
            self.pages.extend(self.STARTING_URL)
        else:
            self.current_page = self.STARTING_URL
            self.pages.append(self.current_page)
        while self.pages:
            self.collect_next_page()
        return

    def _absolute_url(self, url):
        """Return `url` made absolute against DOMAIN, with an http scheme."""
        # Prepend the domain for relative links.  Guard against DOMAIN=None,
        # which used to raise TypeError on the `in` test.
        if self.DOMAIN and self.DOMAIN not in url:
            url = "%s%s" % (self.DOMAIN, url, )
        # BUGFIX: the original tested `"http" not in url`, a substring test
        # that wrongly accepted any URL merely *containing* "http"
        # (e.g. ".../httpdocs/x").  Scheme-relative "//host/..." links are
        # completed here.
        if not url.startswith("http"):
            url = "http:%s" % url
        return url

    def collect_next_page(self):
        """Collect elements from the first page in the pending list.

        Saves the scraped items in self.results, marks the page as
        collected, and finally calls the preproccess_results and
        proccess_results hooks (names kept misspelled: subclasses
        override them).
        """
        if not self.pages:
            return
        self.current_page = self.pages[0]
        print("Collecting courses page: %s" % self.current_page)
        url = self._absolute_url(self.current_page)
        # Timeout added so a dead server cannot hang the whole run.
        response = requests.get(url, timeout=30)
        soup = bs4.BeautifulSoup(response.text, "lxml")
        for container in soup.select(self.ELEMENTS_CONTAINER):
            new_element = {}
            for element in self.ELEMENTS_TO_COLLECT:
                select = container.select(element['select'])
                if not select:
                    continue
                # 'target_field', when present, overrides 'key' as the
                # result-dict key.
                key = element.get('target_field', element['key'])
                if element['type'] == 'text':
                    new_element[key] = self._text_type(select[0].get_text())
                elif element['type'] == 'attr':
                    new_element[key] = self._text_type(
                        select[0].attrs.get(element['attr']))
            if new_element:
                self.results.append(new_element)
        self.get_new_pages(soup)
        self.pages.remove(self.current_page)
        self.collected_pages.append(self.current_page)
        self.preproccess_results()
        self.proccess_results()
        self.results = []
        return

    def get_new_pages(self, soup):
        """Queue pagination links found in `soup` that are not yet known."""
        if not self.LINKS_PAGINATION_SELECT:
            return
        links = [a.attrs.get('href')
                 for a in soup.select(self.LINKS_PAGINATION_SELECT)]
        for link in links:
            # BUGFIX: also skip None hrefs (<a> tags without an href
            # attribute), which the original queued and later crashed on.
            if (not link or link == "#" or
                    link in self.pages or
                    link in self.collected_pages):
                continue
            self.pages.append(link)
        return

    def preproccess_results(self):
        """Hook to transform a page's results before processing them.

        Sometimes it is needed, sometimes not.  When needed, override it
        in descendant classes.  (Name kept misspelled — 'preprocess' —
        for backward compatibility with existing subclasses.)
        """
        pass

    def proccess_results(self):
        """Hook in charge of processing scraped results: saving to a
        database, analytics, etc.  Override in descendant classes.
        (Name kept misspelled — 'process' — for backward compatibility
        with existing subclasses.)
        """
        pass
# ==============================================================================
# Use example:
# class ScrapingEmagisterBase(ScrapingClassBase):
# DOMAIN = 'http://www.emagister.com'
# ELEMENTS_CONTAINER = 'div.items-landings-wrp div.item-landing-wrp'
# ELEMENTS_TO_COLLECT = (
# {'select': 'div.item-landing-data h2 a', 'type': 'attr',
# 'attr': 'title', 'key': 'course_name',
# 'target_field': 'course_name', },
# {'select': 'div.price-box span.price', 'type': 'text',
# 'key': 'course_price', 'target_field': 'course_price', },
# {'select': 'div.item-landing-data span.item-center', 'type': 'text',
# 'key': 'center_name', 'target_field': 'center_name', },
# {'select': 'div.item-landing-logo a img', 'type': 'attr',
# 'attr': 'src', 'key': 'center_image',
# 'target_field': 'center_image', },
# {'select':
# 'div.item-landing-block-top-right ul.course-item-landing-features '
# 'li.icons-label-black-tiny-before',
# 'type': 'text', 'key': 'Tipo', },
# {'select':
# 'div.item-landing-block-top-right ul.course-item-landing-features '
# 'li.icons-check-black-tiny-before',
# 'type': 'text', 'key': 'Nivel', },
# {'select':
# 'div.item-landing-block-top-right ul.course-item-landing-features '
# 'li.icons-map-pointer-black-tiny-before',
# 'type': 'text', 'key': 'Lugar', },
# {'select':
# 'div.item-landing-block-top-right ul.course-item-landing-features '
# 'li.icons-clock-black-tiny-before',
# 'type': 'text', 'key': 'Horas lectivas', },
# {'select':
# 'div.item-landing-block-top-right ul.course-item-landing-features '
# 'li.icons-calendar-black-tiny-before',
# 'type': 'text', 'key': 'Comienzo', },
# )
# LINKS_PAGINATION_SELECT = 'nav.app_search_paginationPlaceholder li a'
# SENDING_LEAD_FUNCTION = 'submit_emagister_lead'
#
# def preproccess_results(self):
# for item in self.results:
# for key in item.keys():
# if isinstance(item[key], (str, unicode, )):
# item[key] = item[key].strip().replace("\n", "")
#
# if "center_image" not in item.keys():
# item["center_image"] = ""
# if "data:" in item["center_image"]:
# item["center_image"] = ""
# if item["center_image"] and "http" not in item["center_image"]:
# item["center_image"] = "http:%s" % item["center_image"]
# return
#
# def proccess_results(self):
# for item in self.results:
# course_info = {
# 'course_name': item.get("course_name", None),
# 'course_type': item.get("course_type", None),
# 'course_modality': item.get("course_modality", None),
# 'course_price': item.get("course_price", None),
# 'university_name': item.get("center_name", None),
# 'university_image': item.get("center_image", None), }
#
# api.save_course(**course_info)
#
# print "Pending: %d pages" % len(self.pages)
# return
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment