Skip to content

Instantly share code, notes, and snippets.

@pawelpolewicz
Created June 12, 2017 11:27
Show Gist options
  • Save pawelpolewicz/8d41bcc4bf3d8c5992a613ffba9744a7 to your computer and use it in GitHub Desktop.
Save pawelpolewicz/8d41bcc4bf3d8c5992a613ffba9744a7 to your computer and use it in GitHub Desktop.
Colombia spider
import logging
import re
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from trounceflow.spiders.exceptions import SpiderCannotGetDownloadURLError
from trounceflow.spiders.impl.countries.base import CountrySpider
from trounceflow.utils.selenium import SeleniumDriverWrapper
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Short alias for Selenium's "element is clickable" expected condition.
CLICKABLE = expected_conditions.element_to_be_clickable

# Spanish three-letter month abbreviations in calendar order; looked up with
# ``date.month - 1`` when building the file-link regex.
SPANISH_MONTH_SPELLING = (
    'Ene', 'Feb', 'Mar',
    'Abr', 'May', 'Jun',
    'Jul', 'Ago', 'Sep',
    'Oct', 'Nov', 'Dic',
)
def wait(driver, locator, timeout=20):
    """Wait until ``locator`` is clickable and return the wait's result.

    Args:
        driver: Selenium driver handed to ``WebDriverWait``.
        locator: Locator passed to ``element_to_be_clickable``.
        timeout: Timeout given to ``WebDriverWait``; defaults to 20.

    Returns:
        Whatever ``WebDriverWait.until`` returns for the clickable condition.

    Raises:
        SpiderCannotGetDownloadURLError: If the wait ends with a
            ``TimeoutException``.
    """
    logger.info('Waiting for locator {}'.format(locator))
    condition = expected_conditions.element_to_be_clickable(locator)
    waiter = WebDriverWait(driver, timeout)
    try:
        found = waiter.until(condition)
    except TimeoutException:
        # Surface timeouts as the spider's own error type.
        raise SpiderCannotGetDownloadURLError(
            'Could not get locator {}'.format(locator))
    return found
class ColombiaSpider(CountrySpider):
    """Locate the download link of a Colombian debt "Profile" document.

    A Selenium browser is driven from ``URL`` through a fixed sequence of
    links to the "Historical Profile" listing, the tab for the requested
    year is opened, and the resulting page source is searched for the anchor
    whose text names the requested year and month.
    """

    # Page on which the browser navigation starts.
    URL = 'http://www.irc.gov.co/'
    # Template for the link-text regex. The ``{...}`` fields are filled via
    # ``str.format``. Raw string so that ``\s`` is handed to the regex engine
    # untouched instead of being an invalid string escape sequence.
    LINK_REGEX = r'{year}\s+Profile([\s-]*)({month}|{spanish_month})'
    # Class name of the year tab elements on the listing page.
    YEAR_TABS_CLASS = 'xzb'
    # Class name of the anchors that are searched for the document link.
    FILE_LINK_CLASS = 'documento_titulo_vinculo'
    # Years strictly below this value are looked up under the 'Historical' tab.
    HISTORICAL_BEFORE_YEAR = 2011
    file_extension = 'pdf'
    country_name = 'Colombia'

    def get_download_url(self, date):
        """Return the ``href`` of the document link for ``date``.

        Args:
            date: Object providing ``year``, ``month`` and ``strftime``
                (e.g. a ``datetime.date``).

        Returns:
            The ``href`` attribute of the first matching anchor.

        Raises:
            SpiderCannotGetDownloadURLError: If navigation fails, no anchor
                matches, or the matching anchor has no ``href``.
        """
        if date.year < self.HISTORICAL_BEFORE_YEAR:
            tab_name = 'Historical'
        else:
            tab_name = str(date.year)

        page_source = self.get_file_list_page(tab_name, date.year)
        soup = BeautifulSoup(page_source, 'html.parser')

        # The link text may spell the month in English or in Spanish, so the
        # pattern accepts either abbreviation.
        # NOTE(review): ``strftime('%b')`` depends on the process locale;
        # presumably an English/C locale is expected here - confirm.
        file_link_re = re.compile(self.LINK_REGEX.format(
            year=date.year,
            month=date.strftime('%b'),
            spanish_month=SPANISH_MONTH_SPELLING[date.month - 1]))

        element = soup.find('a', class_=self.FILE_LINK_CLASS,
                            text=file_link_re)
        if element is None:
            raise SpiderCannotGetDownloadURLError('Could not find file link')

        # Report an anchor without a target through the spider's own error
        # type rather than leaking a bare KeyError to the caller.
        href = element.attrs.get('href')
        if href is None:
            raise SpiderCannotGetDownloadURLError(
                'File link has no href attribute')
        return href

    def get_file_list_page(self, tab_name, year):
        """Navigate to the file listing for ``year`` and return its HTML.

        Args:
            tab_name: Exact text of the year tab to click.
            year: Year used to build the ``'<year> Profile'`` partial link
                text that is waited for and clicked last.

        Returns:
            ``driver.page_source`` after the final click.

        Raises:
            SpiderCannotGetDownloadURLError: If a ``TimeoutException`` occurs
                during navigation, or the year tab cannot be found.
        """
        # ``selenium_driver`` is not defined in this class; presumably it is
        # a context manager inherited from CountrySpider - confirm.
        with self.selenium_driver() as driver:
            try:
                driver.get(self.URL)
                wrapper = SeleniumDriverWrapper(driver)
                # Follow the chain of links leading to the profile listing.
                wrapper.wait_link_text('English Version', CLICKABLE).click()
                wrapper.wait_link_text('Central Government Public Debt '
                                       'Information', CLICKABLE).click()
                wrapper.wait_link_text('Public Debt Statistic Information',
                                       CLICKABLE).click()
                wrapper.wait_link_text('Historical Profile',
                                       CLICKABLE).click()
                # Make sure the year tabs are present before looking one up.
                wrapper.wait_class_name(self.YEAR_TABS_CLASS, CLICKABLE)
                self.get_tab(driver, tab_name).click()
                wrapper.wait_partial_link_text('{} Profile'.format(year),
                                               CLICKABLE).click()
                return driver.page_source
            except TimeoutException as e:
                raise SpiderCannotGetDownloadURLError(e.msg)

    def get_tab(self, driver, year_tab_name):
        """Return the year tab element whose text equals ``year_tab_name``.

        Args:
            driver: Selenium driver positioned on the listing page.
            year_tab_name: Exact tab text to look for.

        Returns:
            The first element with class ``YEAR_TABS_CLASS`` whose ``text``
            equals ``year_tab_name``.

        Raises:
            SpiderCannotGetDownloadURLError: If no tab has that text.
        """
        # NOTE(review): ``find_elements_by_class_name`` is the legacy
        # Selenium API; verify it exists in the pinned Selenium version.
        for tab in driver.find_elements_by_class_name(self.YEAR_TABS_CLASS):
            if tab.text == year_tab_name:
                return tab
        raise SpiderCannotGetDownloadURLError(
            'Could not find year tab name {}'.format(year_tab_name))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment