@LuizArmesto
Created January 30, 2015 01:46
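"""Download the daily reservoir-status images that Sabesp publishes on its
mananciais page (the image id suggests the Cantareira system), saving one
JPEG per date under ./images. Summary inferred from the code below; it is
not part of the original gist.
"""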
from datetime import date, timedelta
import os
import sys

import requests
import lxml.html


class DateIterator(object):
    """Iterates day by day from start_date through end_date, inclusive."""

    def __init__(self, start_date, end_date):
        self.start_date = start_date
        self.end_date = end_date
        self.current = self.start_date

    def __iter__(self):
        return self

    def __next__(self):
        # Python 3 iterator protocol; delegate to next() for Python 2 compat.
        return self.next()

    def next(self):
        if self.current > self.end_date:
            raise StopIteration
        ret = self.current
        self.current += timedelta(1)
        return ret
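
# Illustrative usage of DateIterator (not part of the original gist):
#
#     for d in DateIterator(date(2015, 1, 20), date(2015, 1, 22)):
#         print(d.isoformat())  # 2015-01-20, 2015-01-21, 2015-01-22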


class SabespScraper(object):
    page_url = 'http://www2.sabesp.com.br/mananciais/divulgacaopcj.aspx'
    image_url = ('http://www2.sabesp.com.br/mananciais/'
                 'LeImgDadosSist.aspx?id=imgSistCant{session_id}')
    images_path = 'images'

    def __init__(self, start_date=None, end_date=None):
        self.fails = []
        self._init_date_iterator(start_date, end_date)
        self._init_session()
        self._create_directory(self.images_path)

    def _init_date_iterator(self, start_date, end_date):
        default_start_date = date(2015, 1, 20)
        default_end_date = date.today()
        self.date_iterator = DateIterator(start_date or default_start_date,
                                          end_date or default_end_date)

    def _init_session(self):
        self._session = requests.Session()
        response = self._session.get(self.page_url)
        self._get_session_info(response)
        self._session.headers['Referer'] = self.page_url

    def _get_session_info(self, response):
        # Get the session id from the ASP.NET session cookie.
        cookies = requests.utils.dict_from_cookiejar(response.cookies)
        if 'ASP.NET_SessionId' in cookies:
            self._session_id = cookies['ASP.NET_SessionId']
        # Get the __VIEWSTATE and __EVENTVALIDATION hashes the form expects.
        html = lxml.html.fromstring(response.text)
        form = html.forms[0]
        self._viewstate = form.fields['__VIEWSTATE']
        self._eventvalidation = form.fields['__EVENTVALIDATION']

    def _save_image(self, path, date):
        try:
            image_content = self._get_image(date)
            filename = '{path}/sabesp_{year}_{month}_{day}.jpg'.format(
                path=path, year=date.year, month=date.month, day=date.day)
            with open(filename, 'wb') as image:
                image.write(image_content)
        except requests.exceptions.HTTPError:
            self.fails.append(date)

    def _create_directory(self, directory):
        if not os.path.exists(directory):
            os.makedirs(directory)
    def _visit_page(self, date):
        # Simulate the ASP.NET WebForms postback: select the date in the
        # drop-downs and "click" the save button (the .x/.y fields mimic
        # click coordinates on an <input type="image"> control).
        data = {
            '__VIEWSTATE': self._viewstate,
            '__EVENTVALIDATION': self._eventvalidation,
            'cmbDia': date.day,
            'cmbMes': date.month,
            'cmbAno': date.year,
            'botSalvarHD.x': 5,
            'botSalvarHD.y': 5
        }
        response = self._session.post(self.page_url, data)
        self._get_session_info(response)

    def _get_image(self, date):
        self._visit_page(date)
        image_url = self.image_url.format(session_id=self._session_id)
        response = self._session.get(image_url)
        if response.status_code == 200:
            sys.stdout.write('.')
        else:
            sys.stdout.write('F')
            response.raise_for_status()
        return response.content
    def start(self):
        def iterate(iterator):
            for date in iterator:
                if not self.running:
                    break
                self._save_image(self.images_path, date)
                sys.stdout.flush()

        self.running = True
        print('Getting images...')
        iterate(self.date_iterator)
        if self.fails:
            print('\n\nRetrying failed requests...')
            retries = self.fails
            self.fails = []
            iterate(retries)
        print('\n\nDone.')
        if self.fails:
            print('Failed:', ', '.join(d.isoformat() for d in self.fails))

    def stop(self):
        # Hook to interrupt a running scrape (e.g. from another thread).
        print(' Stopping.')
        self.running = False


if __name__ == '__main__':
    scraper = SabespScraper()
    scraper.start()
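
# Illustrative: scraping a custom window instead of the defaults
# (same constructor, explicit dates; not part of the original gist):
#
#     scraper = SabespScraper(start_date=date(2015, 1, 25),
#                             end_date=date(2015, 1, 31))
#     scraper.start()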