Skip to content

Instantly share code, notes, and snippets.

@Macuyiko
Last active January 6, 2016 19:52
Show Gist options
  • Save Macuyiko/83c5e56263eb6601cd69 to your computer and use it in GitHub Desktop.
Save Macuyiko/83c5e56263eb6601cd69 to your computer and use it in GitHub Desktop.
# Simple scraper proof of concept for Morningstar ETFs.
import requests
from bs4 import BeautifulSoup
import re
def get_viewstate(soup):
    """Extract the ASP.NET postback state fields from the quick-rank form.

    Looks up the form whose action is ``/uk/etfquickrank/default.aspx`` and
    reads the ``value`` attribute of its ``__VIEWSTATE`` and
    ``__EVENTVALIDATION`` inputs.

    Args:
        soup: Parsed page; any object exposing a BeautifulSoup-style
            ``find(name, attrs)`` method.

    Returns:
        A dict mapping ``'__VIEWSTATE'`` and ``'__EVENTVALIDATION'`` to the
        values found in the form.

    Raises:
        ValueError: If the form, or one of the two inputs, is not present.
    """
    mainform = soup.find('form', {"action": "/uk/etfquickrank/default.aspx"})
    if mainform is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError("quick-rank form not found in page")

    state = {}
    for name in ('__VIEWSTATE', '__EVENTVALIDATION'):
        field = mainform.find("input", {"name": name})
        if field is None:
            raise ValueError("hidden input %r not found in form" % name)
        state[name] = field['value']
    return state
def print_table(soup):
    """Print the result grid as tab-separated text, one line per row.

    Finds the table with id
    ``ctl00_ContentPlaceHolder1_aFundQuickrankControl_gridResult`` and, for
    every ``tr`` with class ``gridItem``, prints the text of a fixed list of
    ``td`` cells (selected by class). Each field is followed by a tab and
    each row is terminated by a newline.

    Args:
        soup: Parsed page; any object exposing a BeautifulSoup-style
            ``find(name, attrs)`` method.

    Returns:
        None. Nothing is printed when the table is not on the page.
    """
    table = soup.find('table', {'id': 'ctl00_ContentPlaceHolder1_aFundQuickrankControl_gridResult'})
    if table is None:
        # No result grid on this page: nothing to print.
        return

    cols_to_fetch = [
        'gridFundName',
        'gridCategoryName',
        'gridStarRating',
        'gridYTD',
        'gridOngoingCharge',
        'gridClosePrice',
        'gridClosePriceCurrency',
    ]
    for row in table.findAll('tr', {'class': 'gridItem'}):
        for col in cols_to_fetch:
            cell = row.find('td', {'class': col})
            # Print the text itself: encoding to UTF-8 first would make
            # Python 3's print() emit a b'...' repr instead of the value.
            # A missing cell becomes an empty field so columns stay aligned.
            text = cell.getText() if cell is not None else ''
            print(text, end="\t")
        print()
# GET the first page to obtain the session cookie and the initial viewstate.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from trying to parse an HTTP error page.
r = requests.get('http://tools.morningstar.co.uk/uk/etfquickrank/default.aspx', timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
cookies = r.cookies.get_dict()
viewstate = get_viewstate(soup)

# Prepare the form fields for the postback. The state fields must echo the
# values the server handed out with the page we just fetched.
post_data = {
    '__EVENTTARGET': '',
    '__EVENTARGUMENT': '',
    'ctl00_ContentPlaceHolder1_aFundQuickrankControl_scrtmgrFundQuickrank_HiddenField': '',
    '__LASTFOCUS': '',
    '__VIEWSTATE': viewstate["__VIEWSTATE"],
    '__VIEWSTATEENCRYPTED': '',
    '__EVENTVALIDATION': viewstate["__EVENTVALIDATION"],
    # NOTE(review): presumably selects the fund universe / exchange -- confirm
    # against the options of the ddlUniverse dropdown on the page.
    'ctl00$ContentPlaceHolder1$aFundQuickrankControl$ddlUniverse': 'ETEXG$XASX',
    'ctl00$ContentPlaceHolder1$aFundQuickrankControl$ddlBrandingName': '',
    'ctl00$ContentPlaceHolder1$aFundQuickrankControl$ddlCategory': '',
    'ctl00$ContentPlaceHolder1$aFundQuickrankControl$txtSearchKey': '',
    'ctl00$ContentPlaceHolder1$aFundQuickrankControl$hdnFilterBySelection': '',
    # NOTE(review): presumably the number of rows per result page -- confirm.
    'ctl00$ContentPlaceHolder1$aFundQuickrankControl$ddlPageSize': 100,
}

# POST the form together with the cookies from the initial GET, then print
# the rows of the returned page.
r = requests.post(
    'http://tools.morningstar.co.uk/uk/etfquickrank/default.aspx',
    data=post_data,
    cookies=cookies,
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
print_table(soup)
# Fetching the next page of results: the page number is read from the href
# of the "Next" link and replayed below as the postback's event argument.
next_link = soup.find('a', text='Next')
next_page = None
if next_link is not None:
    # Capture the quoted run of digits that follows a comma in the href.
    # Raw string so "\d" is a regex escape rather than a string escape;
    # "\d+" so an empty argument is never mistaken for a page number.
    page_re = re.search(r",'(\d+)'", next_link.get('href', ''))
    if page_re:
        next_page = page_re.group(1)

if next_page is not None:
    print("Fetching page:", next_page)
    # Make sure to send the latest viewstate; the cookies stay the same.
    viewstate = get_viewstate(soup)
    post_data['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$aFundQuickrankControl$AspNetPager'
    post_data['__EVENTARGUMENT'] = next_page
    post_data['__VIEWSTATE'] = viewstate["__VIEWSTATE"]
    post_data['__EVENTVALIDATION'] = viewstate["__EVENTVALIDATION"]
    # Timeout and status check: do not hang on a stalled connection and do
    # not try to parse an HTTP error page.
    r = requests.post(
        'http://tools.morningstar.co.uk/uk/etfquickrank/default.aspx',
        data=post_data,
        cookies=cookies,
        timeout=30,
    )
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    print_table(soup)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment