simkusr/coding_battles.py

## coding_battles.py
from bs4 import BeautifulSoup
import requests
import datetime
import time


class CodingBatExDe:
    """
    Crawler that collects exercise ("battle") descriptions from CodingBat.

    The crawl starts at a language index page, follows every link found
    inside ``<div class="summ">`` elements, then every ``/prob/`` link on
    those pages, and finally reads each problem page.

    :param request_delay: seconds to sleep before each follow-up request
        so the server is not overloaded (default: 1).

    After :meth:`main` has run, ``start_time`` holds the ``time.time()``
    value taken when the crawl began and ``total_time`` the elapsed
    seconds.
    """

    # Class-level fallback so instances created without __init__ still
    # pause between requests.
    request_delay = 1

    def __init__(self, request_delay=1):
        self.start_time = 0.0
        self.total_time = 0.0
        self.request_delay = request_delay

    def main(self, url, domain):
        """
        Run the whole crawl, print how long it took and return the result.

        :param url: index page to start from, e.g.
            http://codingbat.com/python or http://codingbat.com/java
        :param domain: host that relative links are prefixed with,
            e.g. codingbat.com
        :return: the dictionary built by :meth:`get_battle_desc`
        """
        self.start_time = time.time()
        description = self.get_battle_desc(url, domain)
        self.total_time = time.time() - self.start_time
        print('Total time --->', datetime.timedelta(seconds=self.total_time))
        print('For url --->', url)
        return description

    @staticmethod
    def get_html(url, timeout=30):
        """
        Download ``url`` and return it parsed with the ``lxml`` parser.

        :param url: page to fetch
        :param timeout: seconds to wait for the server; without a timeout
            a single stalled connection would block the crawl forever
        :return: ``BeautifulSoup`` document built from the response text
        """
        response = requests.get(url, timeout=timeout)
        return BeautifulSoup(response.text, 'lxml')

    def gather_battle_links(self, url, domain):
        """
        Collect the links found inside ``<div class="summ">`` on ``url``.

        :return: set of ``'http://' + domain + href`` strings, one for
            every tag carrying an ``href`` attribute inside those divs
        """
        html = self.get_html(url)
        battle_links = set()
        for summary in html.find_all('div', {'class': 'summ'}):
            # href=True selects only tags that have the attribute, so text
            # nodes and attribute-less tags never need to be special-cased.
            for tag in summary.find_all(href=True):
                battle_links.add('http://' + domain + tag['href'])
        return battle_links

    def get_battles(self, url, domain):
        """
        Map every section reachable from ``url`` to its problem links.

        The section name is the fifth ``/``-separated piece of the section
        link (``http://<domain>/<language>/<section>``); links that are too
        short to contain one are skipped.

        :return: dict ``{section_name: [problem_url, ...]}``; sections
            without any ``/prob/`` link are not included
        """
        battles = {}
        # Sorted so that crawl order (and therefore result order) does not
        # depend on set iteration order.
        for btl_link in sorted(self.gather_battle_links(url, domain)):
            parts = btl_link.split('/')
            if len(parts) < 5:
                continue
            battle_name = parts[4]
            time.sleep(self.request_delay)  # not to overload server
            html = self.get_html(btl_link)
            for link in html.find_all('a', href=True):
                href = link['href']
                if '/prob/' in href:
                    battles.setdefault(battle_name, []).append(
                        'http://' + domain + href)
        return battles

    @staticmethod
    def _nodes_after_breaks(html):
        """Return what directly follows each ``<br>`` in top-aligned cells."""
        nodes = []
        for td in html.find_all('td', {'valign': 'top'}):
            for br in td.find_all('br'):
                sibling = br.next_sibling
                # A <br> that is the last child has no next sibling.
                if sibling is None:
                    continue
                if 'Shorter output' not in sibling:
                    nodes.append(sibling)
        return nodes

    def get_battle_desc(self, url, domain):
        """
        Visit every problem page and gather its description data.

        For each problem page, the text of every ``<p class="max2">``
        becomes a key; its value is the list of nodes that follow a
        ``<br>`` inside ``<td valign="top">`` cells, leaving out the ones
        containing ``'Shorter output'``.

        :return: dict ``{description_text: [node_after_br, ...]}``; pages
            that yield no such nodes add nothing
        """
        battle_data = {}
        for problem_links in self.get_battles(url, domain).values():
            for problem_link in problem_links:
                time.sleep(self.request_delay)  # not to overload server
                html = self.get_html(problem_link)
                # The cells do not depend on the paragraph being processed,
                # so they are scanned once per page.
                nodes = self._nodes_after_breaks(html)
                if not nodes:
                    continue
                for battle_expl in html.find_all('p', {'class': 'max2'}):
                    battle_data.setdefault(battle_expl.text, []).extend(nodes)
        return battle_data

# starting_url_python = r'http://codingbat.com/python'
# starting_url_java = r'http://codingbat.com/java'
# main_domain = starting_url_python.split('/')[2]
	from bs4 import BeautifulSoup
	import requests
	import datetime
	import time


	class CodingBatExDe:
	    """
	    Crawler that collects exercise ("battle") descriptions from CodingBat.

	    The crawl starts at a language index page, follows every link found
	    inside ``<div class="summ">`` elements, then every ``/prob/`` link on
	    those pages, and finally reads each problem page.

	    :param request_delay: seconds to sleep before each follow-up request
	        so the server is not overloaded (default: 1).

	    After :meth:`main` has run, ``start_time`` holds the ``time.time()``
	    value taken when the crawl began and ``total_time`` the elapsed
	    seconds.
	    """

	    # Class-level fallback so instances created without __init__ still
	    # pause between requests.
	    request_delay = 1

	    def __init__(self, request_delay=1):
	        self.start_time = 0.0
	        self.total_time = 0.0
	        self.request_delay = request_delay

	    def main(self, url, domain):
	        """
	        Run the whole crawl, print how long it took and return the result.

	        :param url: index page to start from, e.g.
	            http://codingbat.com/python or http://codingbat.com/java
	        :param domain: host that relative links are prefixed with,
	            e.g. codingbat.com
	        :return: the dictionary built by :meth:`get_battle_desc`
	        """
	        self.start_time = time.time()
	        description = self.get_battle_desc(url, domain)
	        self.total_time = time.time() - self.start_time
	        print('Total time --->', datetime.timedelta(seconds=self.total_time))
	        print('For url --->', url)
	        return description

	    @staticmethod
	    def get_html(url, timeout=30):
	        """
	        Download ``url`` and return it parsed with the ``lxml`` parser.

	        :param url: page to fetch
	        :param timeout: seconds to wait for the server; without a timeout
	            a single stalled connection would block the crawl forever
	        :return: ``BeautifulSoup`` document built from the response text
	        """
	        response = requests.get(url, timeout=timeout)
	        return BeautifulSoup(response.text, 'lxml')

	    def gather_battle_links(self, url, domain):
	        """
	        Collect the links found inside ``<div class="summ">`` on ``url``.

	        :return: set of ``'http://' + domain + href`` strings, one for
	            every tag carrying an ``href`` attribute inside those divs
	        """
	        html = self.get_html(url)
	        battle_links = set()
	        for summary in html.find_all('div', {'class': 'summ'}):
	            # href=True selects only tags that have the attribute, so text
	            # nodes and attribute-less tags never need to be special-cased.
	            for tag in summary.find_all(href=True):
	                battle_links.add('http://' + domain + tag['href'])
	        return battle_links

	    def get_battles(self, url, domain):
	        """
	        Map every section reachable from ``url`` to its problem links.

	        The section name is the fifth ``/``-separated piece of the section
	        link (``http://<domain>/<language>/<section>``); links that are too
	        short to contain one are skipped.

	        :return: dict ``{section_name: [problem_url, ...]}``; sections
	            without any ``/prob/`` link are not included
	        """
	        battles = {}
	        # Sorted so that crawl order (and therefore result order) does not
	        # depend on set iteration order.
	        for btl_link in sorted(self.gather_battle_links(url, domain)):
	            parts = btl_link.split('/')
	            if len(parts) < 5:
	                continue
	            battle_name = parts[4]
	            time.sleep(self.request_delay)  # not to overload server
	            html = self.get_html(btl_link)
	            for link in html.find_all('a', href=True):
	                href = link['href']
	                if '/prob/' in href:
	                    battles.setdefault(battle_name, []).append(
	                        'http://' + domain + href)
	        return battles

	    @staticmethod
	    def _nodes_after_breaks(html):
	        """Return what directly follows each ``<br>`` in top-aligned cells."""
	        nodes = []
	        for td in html.find_all('td', {'valign': 'top'}):
	            for br in td.find_all('br'):
	                sibling = br.next_sibling
	                # A <br> that is the last child has no next sibling.
	                if sibling is None:
	                    continue
	                if 'Shorter output' not in sibling:
	                    nodes.append(sibling)
	        return nodes

	    def get_battle_desc(self, url, domain):
	        """
	        Visit every problem page and gather its description data.

	        For each problem page, the text of every ``<p class="max2">``
	        becomes a key; its value is the list of nodes that follow a
	        ``<br>`` inside ``<td valign="top">`` cells, leaving out the ones
	        containing ``'Shorter output'``.

	        :return: dict ``{description_text: [node_after_br, ...]}``; pages
	            that yield no such nodes add nothing
	        """
	        battle_data = {}
	        for problem_links in self.get_battles(url, domain).values():
	            for problem_link in problem_links:
	                time.sleep(self.request_delay)  # not to overload server
	                html = self.get_html(problem_link)
	                # The cells do not depend on the paragraph being processed,
	                # so they are scanned once per page.
	                nodes = self._nodes_after_breaks(html)
	                if not nodes:
	                    continue
	                for battle_expl in html.find_all('p', {'class': 'max2'}):
	                    battle_data.setdefault(battle_expl.text, []).extend(nodes)
	        return battle_data

	# starting_url_python = r'http://codingbat.com/python'
	# starting_url_java = r'http://codingbat.com/java'
	# main_domain = starting_url_python.split('/')[2]