Last active
June 5, 2021 12:41
-
-
Save MBAustin/3464d370600f1b6216c71d72bd41554c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re, sys, time | |
from PyQt4.QtGui import * | |
from PyQt4.QtCore import * | |
from PyQt4.QtWebKit import * | |
from bs4 import BeautifulSoup | |
class Render(QWebPage):
    """QWebPage that begins loading ``url`` as soon as it is constructed.

    ``finished`` starts out False and is set to True by the ``loadFinished``
    slot. ``frame`` is only assigned inside that slot, so callers should wait
    for ``finished`` before reading it.
    """

    def __init__(self, url):
        super(Render, self).__init__()
        self.url = url
        self.finished = False
        # Connect the completion slot first, then start the load.
        self.loadFinished.connect(self._loadFinished)
        target = QUrl(url)
        self.mainFrame().load(target)

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and flag completion.

        ``result`` is accepted as delivered by the signal but is not inspected.
        """
        self.frame = self.mainFrame()
        self.finished = True
# When true, scrape_tournament prints progress information as it runs.
debug = True
# Running count of scrape_tournament calls, shown in the debug output.
run_num = 1
# Install a Qt message handler that ignores its arguments and does nothing.
qInstallMsgHandler(lambda *_ignored: None)
def _is_new_link(href, marker, url, seen_urls):
    """Return True if ``href`` contains ``marker`` and has not been handled yet."""
    return (bool(href) and marker in href
            and href not in url and href not in seen_urls)


def scrape_tournament(url, r_depth, seen_urls, app = None):
    """Collect 'match-details' links reachable from a tournament page.

    The page at ``url`` is rendered through ``Render`` and its anchors are
    processed in three ordered passes: 'match-details' links are recorded,
    then 'matches/' links are followed recursively, then 'schedule/' links
    are followed recursively. Recording every match link on the page before
    recursing keeps the recursive calls from revisiting them.

    Parameters:
        url: absolute URL of the page to visit.
        r_depth: current recursion depth; only used in debug output.
        seen_urls: list of hrefs already handled. It is appended to in place
            and shared with every recursive call.
        app: QApplication used to pump events while the page loads; one is
            created when None is given.

    Returns:
        A non-empty set of href strings. The sentinel 'null url' is added
        when nothing was found on this page or when ``url`` has fewer than
        six '/'-separated segments. Results of recursive calls are merged
        as returned, so the sentinel may appear alongside real links.
    """
    global run_num
    if debug:
        print('run ' + str(run_num) + ' at depth ' + str(r_depth))
        run_num += 1
        print('visiting ' + url)

    matches = set()
    segments = url.split('/')
    if len(segments) < 6:
        # The URL is too short to carry a tournament segment. The original
        # indexed segment 5 unconditionally and raised IndexError here.
        matches.add('null url')
        return matches

    if app is None:
        app = QApplication(sys.argv)

    # Pump the Qt event loop by hand until the page signals completion.
    page = Render(url)
    while not page.finished:
        app.processEvents()
        time.sleep(0.01)

    soup = BeautifulSoup(page.frame.toHtml(), 'html.parser')
    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]

    # Pass 1: record match-detail links without recursing.
    for href in hrefs:
        if _is_new_link(href, 'match-details', url, seen_urls):
            seen_urls.append(href)
            if debug:
                # Same text as before when segments 7 and 8 exist; falls back
                # to the whole URL instead of raising IndexError otherwise.
                context = ': '.join(segments[7:9]) or url
                print('found match details for ' + context)
            matches.add(href)

    # Passes 2 and 3: follow series/match pages first, then schedule pages.
    # seen_urls grows during recursion, so each href is re-checked when its
    # turn comes rather than being filtered up front.
    site_root = 'http://www.lolesports.com'
    for marker, announce in (('matches/', True), ('schedule/', False)):
        for href in hrefs:
            if not _is_new_link(href, marker, url, seen_urls):
                continue
            seen_urls.append(href)
            if debug and announce:
                print('found a series or match: ' + href)
            # The call is synchronous and always returns a non-empty set, so
            # no waiting on the result is needed.
            found = scrape_tournament(site_root + href, r_depth + 1,
                                      seen_urls, app = app)
            print("recursive scrape: " + str(found))
            matches.update(found)

    if not matches:
        matches.add('null url')
    return matches
if __name__ == '__main__':
    # Script entry point: crawl the MSI 2016 group-stage schedule and list
    # every link that scrape_tournament returns.
    start_url = "http://www.lolesports.com/en_US/msi/msi_2016/schedule/groups/1"
    app = QApplication(sys.argv)
    visited = []
    match_list = scrape_tournament(start_url, 0, visited, app = app)
    for match in match_list:
        print(match)
    total = len(match_list)
    print('there were ' + str(total) + ' in total')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment