pawelpolewicz/colombia.py

## colombia.py
import logging
import re

from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

from trounceflow.spiders.exceptions import SpiderCannotGetDownloadURLError
from trounceflow.spiders.impl.countries.base import CountrySpider
from trounceflow.utils.selenium import SeleniumDriverWrapper

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Shorthand for Selenium's "element is clickable" expected condition; passed
# to the SeleniumDriverWrapper wait_* helpers below.
CLICKABLE = expected_conditions.element_to_be_clickable

# Spanish month abbreviations in calendar order, so that
# SPANISH_MONTH_SPELLING[month - 1] is the abbreviation for ``month``.
SPANISH_MONTH_SPELLING = (
    'Ene', 'Feb', 'Mar', 'Abr', 'May', 'Jun',
    'Jul', 'Ago', 'Sep', 'Oct', 'Nov', 'Dic',
)


def wait(driver, locator, timeout=20):
    """Wait until the element at ``locator`` is clickable and return it.

    :param driver: Selenium driver handed to ``WebDriverWait``.
    :param locator: locator passed to ``element_to_be_clickable``.
    :param timeout: timeout handed to ``WebDriverWait`` (default 20).
    :return: whatever ``WebDriverWait.until`` returns for the condition.
    :raises SpiderCannotGetDownloadURLError: if the wait times out.
    """
    logger.info('Waiting for locator {}'.format(locator))
    waiter = WebDriverWait(driver, timeout)
    is_clickable = expected_conditions.element_to_be_clickable(locator)
    try:
        return waiter.until(is_clickable)
    except TimeoutException:
        # Translate the Selenium timeout into the spider-level error.
        message = 'Could not get locator {}'.format(locator)
        raise SpiderCannotGetDownloadURLError(message)


class ColombiaSpider(CountrySpider):
    """Spider that locates the download URL of a Colombian profile file.

    A Selenium browser is driven from ``URL`` through a fixed chain of links
    to the "Historical Profile" page, the relevant year tab is selected, and
    the resulting page source is searched for an anchor whose text matches
    ``LINK_REGEX``.
    """

    URL = 'http://www.irc.gov.co/'

    # Template for the text of a monthly profile link. The month may appear
    # either as ``date.strftime('%b')`` or as its Spanish abbreviation.
    # Raw string: ``\s`` must reach ``re`` verbatim instead of relying on
    # Python passing an unrecognised string escape through.
    LINK_REGEX = r'{year}\s+Profile([\s-]*)({month}|{spanish_month})'
    # Class name of the year tab elements on the profile page.
    YEAR_TABS_CLASS = 'xzb'
    # Class name of the anchors searched for the file link.
    FILE_LINK_CLASS = 'documento_titulo_vinculo'
    # Years strictly below this value are looked up under 'Historical'.
    HISTORICAL_BEFORE_YEAR = 2011

    file_extension = 'pdf'
    country_name = 'Colombia'

    def get_download_url(self, date):
        """Return the ``href`` of the profile link matching ``date``.

        :param date: object providing ``year``, ``month`` and ``strftime``.
        :return: the ``href`` attribute of the first matching anchor.
        :raises SpiderCannotGetDownloadURLError: if no anchor matches, or if
            navigation to the file list fails.
        """
        if date.year < self.HISTORICAL_BEFORE_YEAR:
            tab_name = 'Historical'
        else:
            tab_name = str(date.year)

        page_source = self.get_file_list_page(tab_name, date.year)

        # NOTE(review): '%b' is locale-dependent; this presumably expects an
        # English locale for the non-Spanish spelling -- confirm.
        pattern = self.LINK_REGEX.format(
            year=date.year,
            month=date.strftime('%b'),
            spanish_month=SPANISH_MONTH_SPELLING[date.month - 1])
        file_link_re = re.compile(pattern)

        soup = BeautifulSoup(page_source, 'html.parser')
        element = soup.find('a', class_=self.FILE_LINK_CLASS, text=file_link_re)
        if element is None:
            raise SpiderCannotGetDownloadURLError('Could not find file link')
        return element.attrs['href']

    def get_file_list_page(self, tab_name, year):
        """Navigate to the file list for ``tab_name`` and return its source.

        :param tab_name: text of the year tab to click.
        :param year: year used to build the '<year> Profile' partial link
            text that is waited for and clicked last.
        :return: ``driver.page_source`` after the final click.
        :raises SpiderCannotGetDownloadURLError: if any wait times out or the
            tab cannot be found.
        """
        with self.selenium_driver() as driver:
            try:
                driver.get(self.URL)
                wrapper = SeleniumDriverWrapper(driver)
                wrapper.wait_link_text('English Version', CLICKABLE).click()
                wrapper.wait_link_text('Central Government Public Debt '
                                       'Information', CLICKABLE).click()
                wrapper.wait_link_text('Public Debt Statistic Information',
                                       CLICKABLE).click()
                wrapper.wait_link_text('Historical Profile', CLICKABLE).click()
                # Make sure the tabs are present before looking one up.
                wrapper.wait_class_name(self.YEAR_TABS_CLASS, CLICKABLE)
                self.get_tab(driver, tab_name).click()
                wrapper.wait_partial_link_text('{} Profile'.format(year),
                                               CLICKABLE).click()
                return driver.page_source
            except TimeoutException as e:
                # Translate the Selenium timeout into the spider-level error.
                raise SpiderCannotGetDownloadURLError(e.msg)

    def get_tab(self, driver, year_tab_name):
        """Return the tab element whose text equals ``year_tab_name``.

        :param driver: Selenium driver used to look up the tab elements.
        :param year_tab_name: exact text of the wanted tab.
        :raises SpiderCannotGetDownloadURLError: if no tab has that text.
        """
        for tab in driver.find_elements_by_class_name(self.YEAR_TABS_CLASS):
            if tab.text == year_tab_name:
                return tab
        # Reached only when no tab matched (the loop returns on a match).
        raise SpiderCannotGetDownloadURLError(
            'Could not find year tab name {}'.format(year_tab_name))
	import logging
	import re

	from bs4 import BeautifulSoup
	from selenium.common.exceptions import TimeoutException
	from selenium.webdriver.support import expected_conditions
	from selenium.webdriver.support.wait import WebDriverWait

	from trounceflow.spiders.exceptions import SpiderCannotGetDownloadURLError
	from trounceflow.spiders.impl.countries.base import CountrySpider
	from trounceflow.utils.selenium import SeleniumDriverWrapper

	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)

	# Shorthand for Selenium's "element is clickable" expected condition.
	CLICKABLE = expected_conditions.element_to_be_clickable

	# Spanish month abbreviations in calendar order, so that
	# SPANISH_MONTH_SPELLING[month - 1] is the abbreviation for ``month``.
	SPANISH_MONTH_SPELLING = (
	    'Ene', 'Feb', 'Mar', 'Abr', 'May', 'Jun',
	    'Jul', 'Ago', 'Sep', 'Oct', 'Nov', 'Dic',
	)


	def wait(driver, locator, timeout=20):
	    """Wait until the element at ``locator`` is clickable and return it.

	    :param driver: Selenium driver handed to ``WebDriverWait``.
	    :param locator: locator passed to ``element_to_be_clickable``.
	    :param timeout: timeout handed to ``WebDriverWait`` (default 20).
	    :return: whatever ``WebDriverWait.until`` returns for the condition.
	    :raises SpiderCannotGetDownloadURLError: if the wait times out.
	    """
	    try:
	        logger.info('Waiting for locator {}'.format(locator))
	        return WebDriverWait(driver, timeout).until(
	            expected_conditions.element_to_be_clickable(locator))
	    except TimeoutException:
	        # Translate the Selenium timeout into the spider-level error.
	        raise SpiderCannotGetDownloadURLError(
	            'Could not get locator {}'.format(locator))


	class ColombiaSpider(CountrySpider):
	    """Spider that locates the download URL of a Colombian profile file.

	    A Selenium browser is driven from ``URL`` through a fixed chain of
	    links to the "Historical Profile" page, the relevant year tab is
	    selected, and the resulting page source is searched for an anchor
	    whose text matches ``LINK_REGEX``.
	    """

	    URL = 'http://www.irc.gov.co/'

	    # Template for the text of a monthly profile link. The month may
	    # appear either as ``date.strftime('%b')`` or as its Spanish
	    # abbreviation; the ``|`` is a regex alternation and must not be
	    # escaped. Raw string so ``\s`` reaches ``re`` verbatim.
	    LINK_REGEX = r'{year}\s+Profile([\s-]*)({month}|{spanish_month})'
	    # Class name of the year tab elements on the profile page.
	    YEAR_TABS_CLASS = 'xzb'
	    # Class name of the anchors searched for the file link.
	    FILE_LINK_CLASS = 'documento_titulo_vinculo'
	    # Years strictly below this value are looked up under 'Historical'.
	    HISTORICAL_BEFORE_YEAR = 2011

	    file_extension = 'pdf'
	    country_name = 'Colombia'

	    def get_download_url(self, date):
	        """Return the ``href`` of the profile link matching ``date``.

	        :param date: object providing ``year``, ``month`` and ``strftime``.
	        :return: the ``href`` attribute of the first matching anchor.
	        :raises SpiderCannotGetDownloadURLError: if no anchor matches, or
	            if navigation to the file list fails.
	        """
	        if date.year < self.HISTORICAL_BEFORE_YEAR:
	            tab_name = 'Historical'
	        else:
	            tab_name = str(date.year)

	        page_source = self.get_file_list_page(tab_name, date.year)

	        # NOTE(review): '%b' is locale-dependent; this presumably expects
	        # an English locale for the non-Spanish spelling -- confirm.
	        soup = BeautifulSoup(page_source, 'html.parser')
	        file_link_re = re.compile(self.LINK_REGEX.format(
	            year=date.year, month=date.strftime('%b'),
	            spanish_month=SPANISH_MONTH_SPELLING[date.month - 1]))
	        element = soup.find('a', class_=self.FILE_LINK_CLASS,
	                            text=file_link_re)
	        if element is None:
	            raise SpiderCannotGetDownloadURLError('Could not find file link')
	        return element.attrs['href']

	    def get_file_list_page(self, tab_name, year):
	        """Navigate to the file list for ``tab_name`` and return its source.

	        :param tab_name: text of the year tab to click.
	        :param year: year used to build the '<year> Profile' partial link
	            text that is waited for and clicked last.
	        :return: ``driver.page_source`` after the final click.
	        :raises SpiderCannotGetDownloadURLError: if any wait times out or
	            the tab cannot be found.
	        """
	        with self.selenium_driver() as driver:
	            try:
	                driver.get(self.URL)
	                wrapper = SeleniumDriverWrapper(driver)
	                wrapper.wait_link_text('English Version', CLICKABLE).click()
	                wrapper.wait_link_text('Central Government Public Debt '
	                                       'Information', CLICKABLE).click()
	                wrapper.wait_link_text('Public Debt Statistic Information',
	                                       CLICKABLE).click()
	                wrapper.wait_link_text('Historical Profile',
	                                       CLICKABLE).click()
	                # Make sure the tabs are present before looking one up.
	                wrapper.wait_class_name(self.YEAR_TABS_CLASS, CLICKABLE)
	                self.get_tab(driver, tab_name).click()
	                wrapper.wait_partial_link_text('{} Profile'.format(year),
	                                               CLICKABLE).click()
	                return driver.page_source
	            except TimeoutException as e:
	                # Translate the Selenium timeout into the spider error.
	                raise SpiderCannotGetDownloadURLError(e.msg)

	    def get_tab(self, driver, year_tab_name):
	        """Return the tab element whose text equals ``year_tab_name``.

	        :param driver: Selenium driver used to look up the tab elements.
	        :param year_tab_name: exact text of the wanted tab.
	        :raises SpiderCannotGetDownloadURLError: if no tab has that text.
	        """
	        for tab in driver.find_elements_by_class_name(self.YEAR_TABS_CLASS):
	            if tab.text == year_tab_name:
	                return tab
	        # Reached only when no tab matched (the loop returns on a match).
	        raise SpiderCannotGetDownloadURLError(
	            'Could not find year tab name {}'.format(year_tab_name))