Created
January 30, 2015 01:46
-
-
Save LuizArmesto/7c699b9ebaa59e5e4c13 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import date, timedelta | |
import os, os.path | |
import sys | |
import requests | |
import lxml.html | |
class DateIterator(object):
    """Single-pass iterator over consecutive days, both endpoints included.

    Yields ``start_date``, then each following day, up to and including
    ``end_date``. If ``start_date`` is later than ``end_date`` nothing is
    yielded. The object is its own iterator, so it cannot be restarted once
    it has been consumed.
    """

    def __init__(self, start_date, end_date):
        self.start_date = start_date
        self.end_date = end_date
        # Cursor: the next day that will be handed out.
        self.current = start_date

    def __iter__(self):
        return self

    def next(self):
        """Return the day under the cursor and advance the cursor one day.

        Raises:
            StopIteration: once the cursor has moved past ``end_date``.
        """
        day = self.current
        if day > self.end_date:
            raise StopIteration
        self.current = day + timedelta(days=1)
        return day

    def __next__(self):
        # Python 3 spelling of the iterator protocol; delegates to next().
        return self.next()
class SabespScraper(object):
    """Download one image per day from the Sabesp "mananciais" page.

    For every day produced by the date iterator the scraper posts the page
    form with that day selected, then fetches the image URL built from the
    current ASP.NET session id and stores the bytes under ``images_path``.
    Days whose request fails are collected in ``self.fails`` and retried
    once at the end of :meth:`start`.
    """

    page_url = 'http://www2.sabesp.com.br/mananciais/divulgacaopcj.aspx'
    image_url = 'http://www2.sabesp.com.br/mananciais/LeImgDadosSist.aspx?id=imgSistCant{session_id}'
    images_path = 'images'
    # Class-level default so stop() and the download loop are well defined
    # even before start() has been called.
    running = False

    def __init__(self, start_date=None, end_date=None):
        """Open the HTTP session and prepare the output directory.

        Args:
            start_date: first day to fetch; defaults to 2015-01-20.
            end_date: last day to fetch (inclusive); defaults to today.
        """
        self.fails = []
        self._init_date_iterator(start_date, end_date)
        self._init_session()
        self._create_directory(self.images_path)

    def _init_date_iterator(self, start_date, end_date):
        """Build ``self.date_iterator``, falling back to the default range."""
        default_start_date = date(2015, 1, 20)
        default_end_date = date.today()
        self.date_iterator = DateIterator(start_date or default_start_date,
                                          end_date or default_end_date)

    def _init_session(self):
        """Create the requests session and load the page once.

        The first GET provides the session cookie and the form state that
        later POSTs must echo back.
        """
        self._session = requests.Session()
        response = self._session.get(self.page_url)
        self._get_session_info(response)
        self._session.headers['Referer'] = self.page_url

    def _get_session_info(self, response):
        """Extract the session id and form state from ``response``.

        ``self._session_id`` is only updated when the response itself sets
        the ``ASP.NET_SessionId`` cookie; otherwise the previously stored
        value is kept. The ``__VIEWSTATE`` and ``__EVENTVALIDATION`` values
        are re-read from the first form of every response.
        """
        # get session id from cookie
        cookies = requests.utils.dict_from_cookiejar(response.cookies)
        if 'ASP.NET_SessionId' in cookies:
            self._session_id = cookies['ASP.NET_SessionId']
        # get viewstate and eventvalidation hashes
        html = lxml.html.fromstring(response.text)
        form = html.forms[0]
        self._viewstate = form.fields['__VIEWSTATE']
        self._eventvalidation = form.fields['__EVENTVALIDATION']

    def _save_image(self, path, date):
        """Fetch the image for ``date`` and write it into ``path``.

        Any ``requests`` failure (HTTP error status, connection error,
        timeout, ...) is recorded in ``self.fails`` instead of being raised,
        so one bad day does not abort the whole run and can be retried.
        """
        try:
            image_content = self._get_image(date)
        except requests.exceptions.HTTPError:
            # _get_image already wrote the 'F' progress marker.
            self.fails.append(date)
            return
        except requests.exceptions.RequestException:
            # No response was received, so no marker has been written yet.
            sys.stdout.write('F')
            self.fails.append(date)
            return
        filename = '{path}/sabesp_{year}_{month}_{day}.jpg'.format(
            path=path, year=date.year, month=date.month, day=date.day)
        with open(filename, 'wb') as image:
            image.write(image_content)

    def _create_directory(self, directory):
        """Create ``directory`` (and missing parents) if it does not exist."""
        if not os.path.exists(directory):
            os.makedirs(directory)

    def _visit_page(self, date):
        """POST the page form with ``date`` selected and refresh form state."""
        data = {
            '__VIEWSTATE': self._viewstate,
            '__EVENTVALIDATION': self._eventvalidation,
            'cmbDia': date.day,
            'cmbMes': date.month,
            'cmbAno': date.year,
            # NOTE(review): presumably the click coordinates of an image
            # submit button named botSalvarHD -- confirm against the page.
            'botSalvarHD.x': 5,
            'botSalvarHD.y': 5
        }
        response = self._session.post(self.page_url, data)
        self._get_session_info(response)

    def _get_image(self, date):
        """Return the raw image bytes for ``date``.

        Writes a progress marker to stdout: '.' for a 200 response, 'F'
        for anything else.

        Raises:
            requests.exceptions.HTTPError: if the image request returns an
                error status.
        """
        self._visit_page(date)
        image_url = self.image_url.format(session_id=self._session_id)
        response = self._session.get(image_url)
        # Compare by value: identity ('is') on ints only works by accident
        # of the interpreter's small-int caching.
        if response.status_code == 200:
            sys.stdout.write('.')
        else:
            sys.stdout.write('F')
        response.raise_for_status()
        return response.content

    def start(self):
        """Download every day in the range, then retry the failures once.

        The loop checks ``self.running`` before each day, so a call to
        :meth:`stop` ends it early. Days that still fail after the retry
        pass are listed at the end.
        """
        def iterate(iterator):
            for date in iterator:
                if not self.running:
                    break
                self._save_image(self.images_path, date)
                sys.stdout.flush()

        self.running = True
        print('Getting images...')
        iterate(self.date_iterator)
        if self.fails:
            print('\n\nRetrying failed requests...')
            # Swap in a fresh list so the retry pass records only the days
            # that fail a second time.
            retries = self.fails
            self.fails = []
            iterate(retries)
        print('\n\nDone.')
        if self.fails:
            # Single string argument: identical output on Python 3 and no
            # tuple repr when print is a statement (Python 2).
            print('Failed: ' + ', '.join(d.isoformat() for d in self.fails))

    def stop(self):
        """Ask a running :meth:`start` loop to finish after the current day."""
        print(' Stopping.')
        self.running = False
if __name__ == '__main__':
    # Executed as a script: scrape the default date range.
    SabespScraper().start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment