Skip to content

Instantly share code, notes, and snippets.

@MBAustin
Last active November 12, 2016 21:19
Show Gist options
  • Save MBAustin/161b900e47bca8bf7dbac7caaa511900 to your computer and use it in GitHub Desktop.
Save MBAustin/161b900e47bca8bf7dbac7caaa511900 to your computer and use it in GitHub Desktop.
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from bs4 import BeautifulSoup
# Install a Qt message handler that accepts any arguments and does nothing,
# so messages routed through it are discarded instead of being printed.
qInstallMsgHandler(lambda *args: None)
class Render(QWebPage):
    """Load a URL in a QWebPage and block until the load has finished.

    Constructing an instance starts the load and runs the Qt event loop;
    the constructor returns once ``loadFinished`` has fired.  After that,
    ``self.frame`` holds the page's main frame.
    """

    # Process-wide QApplication shared by every Render.  Holding it on the
    # class keeps it alive between renders, so a new application object is
    # not created for each page that is loaded.
    _app = None

    def __init__(self, url):
        """Start loading *url* and run the event loop until it completes.

        Args:
            url: Address handed to ``QUrl`` and loaded into the main frame.
        """
        # Reuse the existing application object if there is one; Qt expects
        # a single QApplication per process.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        Render._app = app
        self.app = app

        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks here until _loadFinished() calls quit().
        self.app.exec_()

    def _loadFinished(self, result):
        """Record the main frame and leave the event loop.

        Args:
            result: Value delivered by the ``loadFinished`` signal.  It is
                not inspected, so the frame is stored whatever it is.
        """
        self.frame = self.mainFrame()
        self.app.quit()
def retreive_tournament(schedule_url, id, name, year, location):
    """Placeholder: accepts the tournament details and does nothing.

    None of the arguments are used and the function returns ``None``.
    """
def make_soup(in_url):
    """Render *in_url* with ``Render`` and parse the resulting HTML.

    Args:
        in_url: Address of the page to load.

    Returns:
        A ``BeautifulSoup`` tree built with the ``'html.parser'`` backend
        from the HTML of the rendered page's main frame.
    """
    # The page stays bound to a local while its frame is being read.
    page = Render(in_url)
    return BeautifulSoup(page.frame.toHtml(), 'html.parser')
def _split_rounds(in_url):
    """Return ``(super_round, sub_round)`` parsed from a schedule URL.

    The path following ``'schedule/'`` is read as
    ``<super_round>/<sub_round>``.  An element is ``None`` when the URL does
    not carry it.
    """
    parts = in_url.split('schedule/')
    if len(parts) < 2:
        return None, None
    super_round, _, sub_round = parts[1].strip('/').partition('/')
    return super_round or None, sub_round.strip('/') or None


def recursive_scrape(in_url, seen_urls=None):
    """Collect match-history links reachable from a schedule page.

    Every anchor on the page at *in_url* is examined:

    * an href containing ``'schedule'`` that has not been visited yet is
      scraped recursively and its matches are merged into the result;
    * an href containing ``'matches'`` is loaded, and each anchor on that
      page whose href contains ``'matchhistory'`` yields one entry.

    Args:
        in_url: Address of the schedule page to scrape.
        seen_urls: List of hrefs already visited.  It is updated in place so
            that nested calls share it; a fresh list is used when omitted.

    Returns:
        A list of dicts with the keys ``'path'`` (the part of the match
        href after ``'details/'``), ``'super_round'`` and ``'sub_round'``
        (taken from the URL of the schedule page the match was found on).
    """
    if seen_urls is None:
        seen_urls = []
    # Mark the page being scraped so a link back to it is not followed again.
    if in_url not in seen_urls:
        seen_urls.append(in_url)

    # Both round names are always bound, even for URLs without a round part.
    super_round, sub_round = _split_rounds(in_url)

    matches = []
    soup = make_soup(in_url)
    # NOTE(review): hrefs are followed exactly as they appear in the page; if
    # the site emits relative links they may need resolving against in_url.
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        # Visited pages are tracked by href string, not by tag object.
        if 'schedule' in href and href not in seen_urls:
            print('found schedule: ' + href)
            seen_urls.append(href)
            # Keep what the nested schedule page produced.
            matches.extend(recursive_scrape(href, seen_urls))
        elif 'matches' in href:
            match_soup = make_soup(href)
            for match_link in match_soup.find_all('a', href=True):
                match_href = match_link.get('href')
                # Trace output for every anchor on the match page, not only
                # the match-history ones.
                print('found match: ' + match_href)
                if 'matchhistory' not in match_href:
                    continue
                pieces = match_href.split('details/')
                if len(pieces) < 2:
                    # No 'details/' marker, so there is no path to record.
                    continue
                matches.append({
                    'path': pieces[1],
                    'super_round': super_round,
                    'sub_round': sub_round,
                })
    return matches
# Schedule page used as the starting point for both loops below.
schedule_url = 'http://www.lolesports.com/en_US/msi/msi_2016/schedule/default'

# Print every match entry found by crawling from the schedule page.
for found_match in recursive_scrape(schedule_url, []):
    print(found_match)

# Print the href of every anchor on the schedule page itself.
for anchor in make_soup(schedule_url).find_all('a', href=True):
    print(anchor.get('href'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment