Created
May 9, 2018 18:39
-
-
Save simkusr/89c732831b59c0569b35c062598da3f8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
import datetime | |
import time | |
class CodingBatExDe:
    """
    Crawler that collects CodingBat battle descriptions and their examples.

    Usage: ``CodingBatExDe().main(url, domain)``

    :param url - http://codingbat.com/ after the slash it
                 can be either Python or Java.
    :param domain => codingbat.com

    ``main`` returns a dictionary mapping every battle description to the
    list of example lines found on that battle's page.
    """

    def __init__(self, delay=1):
        """
        :param delay - seconds to wait before each page request after the
                       first one, so the server is not overloaded.
        """
        self.start_time = 0.0
        self.total_time = 0.0
        self.delay = delay

    def main(self, url, domain):
        """Run the whole crawl, print how long it took and return the data."""
        self.start_time = time.time()
        description = self.get_battle_desc(url, domain)
        self.total_time = time.time() - self.start_time
        elapsed = datetime.timedelta(seconds=self.total_time)
        print('Total time --->', str(elapsed))
        print('For url --->', url)
        return description

    @staticmethod
    def get_html(url, timeout=30):
        """
        Download ``url`` and return it parsed with BeautifulSoup (lxml).

        :param timeout - seconds passed to ``requests.get``; without it a
                         stalled connection would block the crawl forever.
        """
        response = requests.get(url, timeout=timeout)
        return BeautifulSoup(response.text, 'lxml')

    def gather_battle_links(self, url, domain):
        """
        Return the set of absolute links found inside every
        ``<div class="summ">`` of the page at ``url``.
        """
        html = self.get_html(url)
        battle_links = set()
        for summary in html.find_all('div', {'class': 'summ'}):
            for child in summary.descendants:
                # Text nodes among the descendants carry no ``attrs``;
                # only element nodes can hold an href.
                attrs = getattr(child, 'attrs', None)
                if attrs and 'href' in attrs:
                    battle_links.add('http://' + domain + attrs['href'])
        return battle_links

    def get_battles(self, url, domain):
        """
        Return ``{battle_name: [problem_link, ...]}`` for every battle
        section linked from the page at ``url``.
        """
        battles = {}
        # Sorted so the crawl order (and the order of the result) does not
        # depend on set iteration order.
        for btl_link in sorted(self.gather_battle_links(url, domain)):
            segments = btl_link.split('/')
            if len(segments) <= 4:
                # No section name after http://<domain>/<language>/ - there
                # is nothing to file the problems under, so skip the link.
                continue
            battle_name = segments[4]
            time.sleep(self.delay)  # not to overload server
            html = self.get_html(btl_link)
            for link in html.find_all('a'):
                href = link.attrs.get('href')
                if href and '/prob/' in href:
                    battles.setdefault(battle_name, []).append(
                        'http://' + domain + href)
        return battles

    @staticmethod
    def _collect_examples(html):
        """Return the nodes that directly follow each <br> in top-aligned cells."""
        examples = []
        for td in html.find_all('td', {'valign': 'top'}):
            for br in td.find_all('br'):
                sibling = br.next_sibling
                # A <br> that closes its parent has no next sibling; testing
                # ``in None`` would raise TypeError and abort the crawl.
                if sibling is None:
                    continue
                if 'Shorter output' not in sibling:
                    examples.append(sibling)
        return examples

    def get_battle_desc(self, url, domain):
        """
        Visit every problem page and return
        ``{battle_description: [example, ...]}``.

        Pages without a ``<p class="max2">`` description, or without any
        example, contribute nothing to the result.
        """
        battles = self.get_battles(url, domain)
        battle_data = {}
        for problem_links in battles.values():
            for problem_link in problem_links:
                time.sleep(self.delay)  # not to overload server
                html = self.get_html(problem_link)
                examples = self._collect_examples(html)
                if not examples:
                    continue
                for battle_expl in html.find_all('p', {'class': 'max2'}):
                    battle_data.setdefault(
                        battle_expl.text, []).extend(examples)
        return battle_data
# starting_url_python = r'http://codingbat.com/python' | |
# starting_url_java = r'http://codingbat.com/java' | |
# main_domain = starting_url_python.split('/')[2] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment