Skip to content

Instantly share code, notes, and snippets.

@MBAustin
Last active June 5, 2021 12:41
Show Gist options
  • Save MBAustin/3464d370600f1b6216c71d72bd41554c to your computer and use it in GitHub Desktop.
Save MBAustin/3464d370600f1b6216c71d72bd41554c to your computer and use it in GitHub Desktop.
import re, sys, time
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from bs4 import BeautifulSoup
class Render(QWebPage):
    """Web page that begins loading ``url`` as soon as it is constructed.

    ``finished`` stays False until the page's ``loadFinished`` signal fires;
    at that point ``frame`` is set to the page's main frame so the caller
    can read the loaded document from it.
    """

    def __init__(self, url):
        """Start loading ``url`` and arrange to be told when loading ends."""
        QWebPage.__init__(self)
        self.finished = False
        self.url = url
        # Hook up the completion slot before the request is issued.
        self.loadFinished.connect(self._loadFinished)
        target = QUrl(url)
        self.mainFrame().load(target)

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and flag completion.

        ``result`` is the value delivered by the signal; it is not inspected.
        """
        main_frame = self.mainFrame()
        self.frame = main_frame
        self.finished = True
# When true, scrape_tournament prints progress messages as it runs.
debug = True
# Install a Qt message handler that ignores its arguments and does nothing,
# so messages routed through it are discarded.
qInstallMsgHandler(lambda *args: None)
# Running count of scrape_tournament calls, reported (and incremented) only
# in the debug output; starts at 1.
run_num = 1
# Sentinel placed in the result when a page yields no match links at all.
_NULL_URL = 'null url'
# Prefix put in front of every href before it is scraped recursively.
_SITE_ROOT = 'http://www.lolesports.com'


def _url_segment(parts, index):
    """Return parts[index], or '?' when the URL has no such segment."""
    return parts[index] if index < len(parts) else '?'


def _is_new_link(href, marker, url, seen_urls):
    """Tell whether href contains marker and has not been handled yet."""
    return (bool(href) and marker in href
            and href not in url and href not in seen_urls)


def _scrape_linked_pages(hrefs, marker, url, r_depth, seen_urls, app,
                         label=None):
    """Scrape every unseen href containing marker and merge the results.

    Each followed href is appended to seen_urls before recursing so that
    deeper calls do not visit it again. The 'null url' sentinel returned by
    pages without matches is dropped from the merged set.
    """
    found = set()
    for href in hrefs:
        if not _is_new_link(href, marker, url, seen_urls):
            continue
        seen_urls.append(href)
        if debug and label:
            print(label + href)
        # The recursive call is synchronous, so its result is complete as
        # soon as it returns; there is nothing to wait for.
        child = scrape_tournament(_SITE_ROOT + href, r_depth + 1, seen_urls,
                                  app=app)
        print("recursive scrape: " + str(child))
        found.update(child)
    found.discard(_NULL_URL)
    return found


def scrape_tournament(url, r_depth, seen_urls, app = None):
    """Collect the 'match-details' hrefs reachable from a tournament page.

    The page at ``url`` is rendered with ``Render``, every ``<a>`` href is
    read from the resulting HTML, and then:

    * hrefs containing 'match-details' are added to the result;
    * hrefs containing 'matches/' and then 'schedule/' are scraped
      recursively and their results merged in.

    Args:
        url: Address of the page to scrape. It must have at least six
            '/'-separated parts with a non-empty sixth part.
        r_depth: Recursion depth of this call; only used in debug output
            and passed on, incremented, to recursive calls.
        seen_urls: List of hrefs already handled. It is appended to in
            place and shared with recursive calls.
        app: QApplication whose events are processed while the page loads.
            A new one is created when None.

    Returns:
        A set of href strings. It is never empty: when nothing was found,
        or ``url`` is too short to be scraped, it is ``{'null url'}``.
    """
    global run_num
    if debug:
        print('run ' + str(run_num) + ' at depth ' + str(r_depth))
        run_num += 1
        print('visiting ' + url)

    parts = url.split('/')
    # A URL without a non-empty sixth segment cannot be a tournament page.
    if len(parts) < 6 or not parts[5]:
        return {_NULL_URL}

    if app is None:
        app = QApplication(sys.argv)

    # Pump the Qt event loop by hand until the page reports it has loaded.
    r = Render(url)
    while not r.finished:
        app.processEvents()
        time.sleep(0.01)

    soup = BeautifulSoup(r.frame.toHtml(), 'html.parser')
    hrefs = [link.get('href') for link in soup.find_all('a')]

    matches = set()
    # Three separate passes: direct match links are all recorded in
    # seen_urls before any recursion starts, which avoids needless recursion.
    for href in hrefs:
        if _is_new_link(href, 'match-details', url, seen_urls):
            seen_urls.append(href)
            if debug:
                # Guarded lookups: a short URL must not crash the scrape
                # just because of a debug message.
                print('found match details for ' + _url_segment(parts, 7)
                      + ': ' + _url_segment(parts, 8))
            matches.add(href)

    matches.update(_scrape_linked_pages(
        hrefs, 'matches/', url, r_depth, seen_urls, app,
        label='found a series or match: '))
    matches.update(_scrape_linked_pages(
        hrefs, 'schedule/', url, r_depth, seen_urls, app))

    if not matches:
        matches.add(_NULL_URL)
    return matches
if __name__ == '__main__':
    # Script entry point: scrape the MSI 2016 group-stage schedule starting
    # at depth 0 with nothing seen yet, then list every result and the total.
    app = QApplication(sys.argv)
    start_url = "http://www.lolesports.com/en_US/msi/msi_2016/schedule/groups/1"
    already_seen = []
    match_list = scrape_tournament(start_url, 0, already_seen, app = app)
    for match in match_list:
        print(match)
    total = len(match_list)
    print('there were ' + str(total) + ' in total')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment