Created
August 21, 2017 18:45
-
-
Save jarek/d73c672d8dd4ddb48d80bffc4d8038ba to your computer and use it in GitHub Desktop.
SV parser beginnings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# for https://github.com/tmrowco/electricitymap/issues/678
"""Exploratory scraper for the dashboard at estadistico.ut.com.sv.

The script:

1. loads the dashboard page and reads its hidden form inputs
   (``__VIEWSTATE``, ``__VIEWSTATEGENERATOR``, ``__EVENTVALIDATION``);
2. posts an "Initialize" callback for the ``OperacionDiaria`` dashboard
   using the same session;
3. dumps the raw response to ``response.txt`` and the unwrapped ``result``
   payload to ``response.js``, printing a few diagnostics along the way.

Nothing is parsed into structured data yet; see the notes at the bottom.
"""
import io
import json
from pprint import pprint

from bs4 import BeautifulSoup
import requests

# Seconds to wait for the server before giving up, so that a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT = 30

# Wrapper the server puts in front of the JSON-ish callback payload; the
# payload is followed by one closing character that is stripped as well.
CALLBACK_PREFIX = '0|/*DX*/('


def _hidden_field(page, field_id):
    """Return the ``value`` attribute of the ``<input>`` with id *field_id*.

    Raises ValueError when the input is absent, instead of the opaque
    "'NoneType' object is not subscriptable" that a bare lookup would give.
    """
    tag = page.find("input", attrs={'id': field_id})
    if tag is None:
        raise ValueError(
            'hidden form field {!r} not found in page'.format(field_id))
    return tag['value']


url = 'http://estadistico.ut.com.sv/OperacionDiaria.aspx'

# One session for both requests so that cookies set by the GET are sent
# along with the POST.
s = requests.Session()
pagereq = s.get(url, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(pagereq.content, 'html.parser')

viewstategenerator = _hidden_field(soup, '__VIEWSTATEGENERATOR')
viewstate = _hidden_field(soup, '__VIEWSTATE')
eventvalidation = _hidden_field(soup, '__EVENTVALIDATION')

# Sent verbatim with the callback; NOTE(review): presumably copied from a
# browser session - confirm whether the server actually requires them.
DXCss = '1_33,1_4,1_9,1_5,15_2,15_4'
DXScript = '1_232,1_134,1_225,1_169,1_187,15_1,1_183,1_182,1_140,1_147,1_148,1_142,1_141,1_143,1_144,1_145,1_146,15_0,15_6,15_7'

# this works, but only gives data for current day
callback_param_init = 'c0:{"Task":"Initialize","DashboardId":"OperacionDiaria","Settings":{"calculateHiddenTotals":false},"RequestMarker":0,"ClientState":{}}'

# This should work to get any date, but doesn't work with just any
# SessionId/Context, or with no SessionId and Context specified.
# It might work after you call to callback_param_init first and extract
# SessionId and Context from the response to that.
#
# Neither variant is sent anywhere yet. The first assignment (with a captured
# SessionId/Context) is kept only as a record of what the browser sent and is
# immediately overwritten by the second one (without them).
# The value is a JavaScript expression: Date months are zero-based, so
# `new Date(2017,7,20)` means 20 August 2017.
callback_param_specific_date = '__CALLBACKPARAM:c0:{"Task":"ReloadData","DashboardParameters":[{"Name":"FechaConsulta","Value":new Date(2017,7,20)}],"SessionId":"7d34ddcb-9f7c-43ae-899e-53c1f193ec37","Context":"BwAHAAIkYTk4OTllZDktNWE3MS00MGE5LWFkMDMtNWU1OWJhNzViMGU0Ag9PcGVyYWNpb25EaWFyaWECAAIAAAAAAMByQA==","RequestMarker":1,"ClientState":{}}'
callback_param_specific_date = '__CALLBACKPARAM:c0:{"Task":"ReloadData","DashboardParameters":[{"Name":"FechaConsulta","Value":new Date(2017,7,20)}],"RequestMarker":1,"ClientState":{}}'

postdata = {
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategenerator,
    '__EVENTVALIDATION': eventvalidation,
    '__CALLBACKPARAM': callback_param_init,
    '__CALLBACKID': 'ASPxDashboardViewer1',
    'DXScript': DXScript,
    'DXCss': DXCss,
}

datareq = s.post(url, data=postdata, timeout=REQUEST_TIMEOUT)
print(datareq.status_code)

# Keep the untouched bytes around for offline inspection.
with open('response.txt', 'wb') as f:
    f.write(datareq.content)

# You might get encoding problems reading datareq.text in Python 2. It works
# fine in Python 3 and I'm sure "the usual hacks" would solve it in Python 2...
response_text = datareq.text
if not response_text.startswith(CALLBACK_PREFIX):
    # Fail with a readable message rather than a confusing JSON error caused
    # by slicing an unexpected response (error page, changed format, ...).
    raise ValueError(
        'unexpected callback response, starts with {!r}'.format(
            response_text[:50]))

# Strip the wrapper prefix and the single trailing character.
double_json = response_text[len(CALLBACK_PREFIX):-1]
# The payload uses single quotes; swap them so json.loads accepts it.
# NOTE: this is a blanket replacement and also rewrites any apostrophes that
# occur inside the data itself.
double_json = double_json.replace('\'', '"')
data = json.loads(double_json)

print(data.keys())
print(data['result'][:1000])
print(data['result'][-1000:])

# Write as UTF-8 explicitly: the payload can contain non-ASCII text, and the
# platform default encoding (or Python 2's implicit ASCII) may not be able to
# represent it. io.open behaves the same on Python 2 and 3.
# The first and last characters of 'result' are dropped before writing.
with io.open('response.js', 'w', encoding='utf-8') as f:
    f.write(data['result'][1:-1])

# At this point, response.js contains a dict that approximates but is not
# quite JSON. More specifically, it is Javascript. It looks like the "result"
# key was intended to be fed into JS eval(), which could deal with
# single-quotes and process values like `new Date(2017,7,11,0,0,0,0)` as
# expected.
#
# The easiest way to get some data would be to do string searches on
# data['result'] to find parts that have information we want without
# containing explicit timestamps (`new Date(2017,7,11,0,0,0,0)...`).
# Then cut out that bit and feed it into json.loads(). Search for "Hidroel"
# will get you started.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment