Created
June 12, 2017 11:27
-
-
Save pawelpolewicz/8d41bcc4bf3d8c5992a613ffba9744a7 to your computer and use it in GitHub Desktop.
Colombia spider
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import re | |
from bs4 import BeautifulSoup | |
from selenium.common.exceptions import TimeoutException | |
from selenium.webdriver.support import expected_conditions | |
from selenium.webdriver.support.wait import WebDriverWait | |
from trounceflow.spiders.exceptions import SpiderCannotGetDownloadURLError | |
from trounceflow.spiders.impl.countries.base import CountrySpider | |
from trounceflow.utils.selenium import SeleniumDriverWrapper | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Shorthand for Selenium's "element is clickable" expected condition.
CLICKABLE = expected_conditions.element_to_be_clickable

# Spanish month abbreviations in calendar order; looked up with
# ``date.month - 1`` when building the file-link regex.
SPANISH_MONTH_SPELLING = (
    'Ene', 'Feb', 'Mar', 'Abr',
    'May', 'Jun', 'Jul', 'Ago',
    'Sep', 'Oct', 'Nov', 'Dic',
)
def wait(driver, locator, timeout=20):
    """Wait until the element at ``locator`` is clickable and return it.

    Args:
        driver: Selenium driver handed to ``WebDriverWait``.
        locator: locator handed to ``element_to_be_clickable``.
        timeout: maximum time to wait, passed to ``WebDriverWait``
            (defaults to 20).

    Returns:
        The value produced by ``WebDriverWait.until`` for the
        clickable condition.

    Raises:
        SpiderCannotGetDownloadURLError: if the wait times out.
    """
    logger.info('Waiting for locator {}'.format(locator))
    condition = expected_conditions.element_to_be_clickable(locator)
    waiter = WebDriverWait(driver, timeout)
    try:
        # Only the polling call can time out, so it alone is guarded.
        return waiter.until(condition)
    except TimeoutException:
        raise SpiderCannotGetDownloadURLError(
            'Could not get locator {}'.format(locator))
class ColombiaSpider(CountrySpider):
    """Spider that finds the download link of a monthly "Profile" PDF.

    The spider drives a browser from ``URL`` through a fixed chain of
    links to the page listing the profile documents, selects the tab for
    the requested year and then searches the resulting HTML for the
    anchor whose text names the requested year and month.
    """

    # Entry page the browser is pointed at.
    URL = 'http://www.irc.gov.co/'
    # Template for the link-text regex; filled in with ``str.format``.
    # Kept as a raw string so ``\s`` reaches the regex engine untouched
    # instead of being an invalid string escape sequence.
    LINK_REGEX = r'{year}\s+Profile([\s-]*)({month}|{spanish_month})'
    # CSS class carried by the year tabs on the listing page.
    YEAR_TABS_CLASS = 'xzb'
    # CSS class carried by the document links on the listing page.
    FILE_LINK_CLASS = 'documento_titulo_vinculo'
    # Years strictly below this one are looked up under the 'Historical' tab.
    HISTORICAL_BEFORE_YEAR = 2011

    file_extension = 'pdf'
    country_name = 'Colombia'

    def get_download_url(self, date):
        """Return the ``href`` of the profile document for ``date``.

        Args:
            date: date-like object; its ``year``, ``month`` and
                ``strftime`` are used.

        Returns:
            The ``href`` attribute of the first matching anchor, exactly
            as it appears in the page.

        Raises:
            SpiderCannotGetDownloadURLError: if no anchor matches, the
                matching anchor has no usable ``href``, or the listing
                page could not be reached.
        """
        if date.year < self.HISTORICAL_BEFORE_YEAR:
            tab_name = 'Historical'
        else:
            tab_name = str(date.year)

        page_source = self.get_file_list_page(tab_name, date.year)
        soup = BeautifulSoup(page_source, 'html.parser')

        # The link text may spell the month in English or in Spanish.
        # NOTE(review): '%b' is locale-dependent; this presumably relies
        # on the process running with an English/C time locale - confirm.
        file_link_re = re.compile(self.LINK_REGEX.format(
            year=date.year,
            month=date.strftime('%b'),
            spanish_month=SPANISH_MONTH_SPELLING[date.month - 1]))

        element = soup.find(
            'a', class_=self.FILE_LINK_CLASS, text=file_link_re)
        if element is None:
            raise SpiderCannotGetDownloadURLError('Could not find file link')

        # Report a missing/empty href with the spider's own error type
        # rather than leaking a bare KeyError to the caller.
        href = element.get('href')
        if not href:
            raise SpiderCannotGetDownloadURLError(
                'File link has no href attribute')
        return href

    def get_file_list_page(self, tab_name, year):
        """Navigate to the document listing and return its HTML source.

        Args:
            tab_name: exact text of the year tab to click.
            year: year used to build the '<year> Profile' partial link
                text that is clicked last.

        Returns:
            ``driver.page_source`` after the final click.

        Raises:
            SpiderCannotGetDownloadURLError: if any wait times out or the
                year tab cannot be found.
        """
        # ``selenium_driver`` is not defined in this class; presumably it
        # is provided by CountrySpider.
        with self.selenium_driver() as driver:
            try:
                driver.get(self.URL)
                wrapper = SeleniumDriverWrapper(driver)
                wrapper.wait_link_text('English Version', CLICKABLE).click()
                wrapper.wait_link_text('Central Government Public Debt '
                                       'Information', CLICKABLE).click()
                wrapper.wait_link_text('Public Debt Statistic Information',
                                       CLICKABLE).click()
                wrapper.wait_link_text('Historical Profile', CLICKABLE).click()
                # Wait for the year tabs to be clickable before picking one.
                wrapper.wait_class_name(self.YEAR_TABS_CLASS, CLICKABLE)
                self.get_tab(driver, tab_name).click()
                wrapper.wait_partial_link_text('{} Profile'.format(year),
                                               CLICKABLE).click()
                return driver.page_source
            except TimeoutException as e:
                raise SpiderCannotGetDownloadURLError(e.msg)

    def get_tab(self, driver, year_tab_name):
        """Return the year tab whose text equals ``year_tab_name``.

        Args:
            driver: Selenium driver positioned on the listing page.
            year_tab_name: exact tab text to look for.

        Returns:
            The first element with class ``YEAR_TABS_CLASS`` whose
            ``text`` equals ``year_tab_name``.

        Raises:
            SpiderCannotGetDownloadURLError: if no tab has that text.
        """
        # NOTE(review): find_elements_by_class_name was removed in
        # Selenium 4.3; use find_elements(By.CLASS_NAME, ...) on upgrade.
        for tab in driver.find_elements_by_class_name(self.YEAR_TABS_CLASS):
            if tab.text == year_tab_name:
                return tab
        # Reached only when every tab was inspected without a match.
        raise SpiderCannotGetDownloadURLError(
            'Could not find year tab name {}'.format(year_tab_name))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment