Skip to content

Instantly share code, notes, and snippets.

@simkusr
Created May 9, 2018 18:39
Show Gist options
  • Save simkusr/89c732831b59c0569b35c062598da3f8 to your computer and use it in GitHub Desktop.
Save simkusr/89c732831b59c0569b35c062598da3f8 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import datetime
import time
class CodingBatExDe:
    """Crawl CodingBat and collect the example lines shown for each problem.

    The crawl goes from a language landing page to its section ("battle")
    pages and from there to the individual problem pages.

    :param delay: seconds to sleep before every section/problem request so
        the server is not overloaded. Defaults to 1.

    Typical use is ``CodingBatExDe().main(url, domain)`` where ``url`` is a
    landing page such as ``http://codingbat.com/python`` (or ``.../java``)
    and ``domain`` is the host used to build absolute links, e.g.
    ``codingbat.com``. The result is a dictionary mapping each problem
    description to the list of text fragments that follow the ``<br>`` tags
    on that problem page.
    """

    def __init__(self, delay=1):
        self.start_time = float()
        self.total_time = float()
        self.delay = delay

    def main(self, url, domain):
        """Run the full crawl, print the elapsed time and return the result.

        :param url: landing page to start from.
        :param domain: host name prepended to the relative links found.
        :return: the dictionary produced by :meth:`get_battle_desc`.
        """
        self.start_time = time.time()
        description = self.get_battle_desc(url, domain)
        self.total_time = time.time() - self.start_time
        elapsed = datetime.timedelta(seconds=self.total_time)
        print('Total time --->', str(elapsed))
        print('For url --->', url)
        return description

    @staticmethod
    def get_html(url, timeout=30):
        """Download *url* and return it parsed as a ``BeautifulSoup`` tree.

        :param url: address of the page to fetch.
        :param timeout: seconds ``requests`` waits for the server; without a
            timeout a stalled connection would block the crawl forever.
        :return: the response body parsed with the ``lxml`` parser.
        """
        response = requests.get(url, timeout=timeout)
        return BeautifulSoup(response.text, 'lxml')

    def _pause(self):
        """Sleep for ``self.delay`` seconds to throttle consecutive requests."""
        if self.delay > 0:
            time.sleep(self.delay)

    def gather_battle_links(self, url, domain):
        """Collect the absolute links found inside ``<div class="summ">``.

        :param url: landing page to scan.
        :param domain: host name prepended to every ``href`` found.
        :return: a set of ``'http://' + domain + href`` strings.
        """
        html = self.get_html(url)
        battle_links = set()
        for summary in html.find_all('div', {'class': 'summ'}):
            # href=True matches every descendant tag that carries an href
            # attribute, so text nodes never need to be probed for attrs.
            for tag in summary.find_all(href=True):
                battle_links.add('http://' + domain + tag['href'])
        return battle_links

    def get_battles(self, url, domain):
        """Map every section name to the problem links listed on its page.

        :param url: landing page to start from.
        :param domain: host name prepended to every ``href`` found.
        :return: ``{section_name: [problem_url, ...]}``; only links whose
            ``href`` contains ``/prob/`` are kept.
        """
        battles = {}
        # Sorted so that the crawl order (and the order of the resulting
        # dictionary) does not depend on set hashing.
        for btl_link in sorted(self.gather_battle_links(url, domain)):
            parts = btl_link.split('/')
            if len(parts) < 5:
                # 'http://<domain>/<lang>/<section>' has the section name at
                # index 4; shorter links carry no section name to file under.
                continue
            battle_name = parts[4]
            self._pause()
            html = self.get_html(btl_link)
            for link in html.find_all('a', href=True):
                if '/prob/' in link['href']:
                    full_link = 'http://' + domain + link['href']
                    battles.setdefault(battle_name, []).append(full_link)
        return battles

    def get_battle_desc(self, url, domain):
        """Fetch every problem page and gather the text following its ``<br>`` tags.

        :param url: landing page to start from.
        :param domain: host name prepended to every ``href`` found.
        :return: ``{description: [text, ...]}`` where the description is the
            text of the last ``<p class="max2">`` on the problem page and the
            list holds the text nodes that directly follow a ``<br>`` inside
            a ``<td valign="top">``, except those containing
            ``'Shorter output'``.

        Pages without a description paragraph are skipped, so their examples
        are never attributed to another problem.
        """
        battles = self.get_battles(url, domain)
        battle_data = {}
        for problem_links in battles.values():
            for problem_link in problem_links:
                self._pause()  # not to overload server
                html = self.get_html(problem_link)
                paragraphs = html.find_all('p', {'class': 'max2'})
                if not paragraphs:
                    continue
                battle_desc = paragraphs[-1].text
                for td in html.find_all('td', {'valign': 'top'}):
                    for br in td.find_all('br'):
                        sibling = br.next_sibling
                        # A <br> may be the last node (None) or be followed
                        # by another tag; only text nodes are examples.
                        if not isinstance(sibling, str):
                            continue
                        if 'Shorter output' not in sibling:
                            battle_data.setdefault(
                                battle_desc, []).append(sibling)
        return battle_data
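# Example usage:
# starting_url_python = r'http://codingbat.com/python'
# starting_url_java = r'http://codingbat.com/java'
# main_domain = starting_url_python.split('/')[2]  # -> 'codingbat.com'
# battles = CodingBatExDe().main(starting_url_python, main_domain)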
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment