Skip to content

Instantly share code, notes, and snippets.

@LuanP
Last active August 29, 2015 14:27
Show Gist options
  • Save LuanP/d592322207eee4f5fa87 to your computer and use it in GitHub Desktop.
Save LuanP/d592322207eee4f5fa87 to your computer and use it in GitHub Desktop.
simplificando web crawlers
# -*- coding: utf-8 -*-
import sys
import time
import urllib3
import urlparse
import requests
from bs4 import BeautifulSoup
# Suppress urllib3's warnings for the whole process before any request is made.
# NOTE(review): presumably meant to silence TLS-related warning noise while
# crawling; confirm this is the urllib3 copy that `requests` actually uses.
urllib3.disable_warnings()
class Crawler(object):
    """Breadth-first crawler restricted to the site of the starting URL.

    Attributes:
        url: The starting URL passed to the constructor.
        base_url: ``scheme://host[:port]`` of the starting URL; only links
            with this exact origin are followed.
        links: Dict mapping every discovered same-site URL to
            ``{'found': int, 'errors': list, 'crawled': bool}``.
        crawled: URLs that have been processed, in processing order.
        not_crawled: FIFO queue of discovered URLs still waiting to be fetched.
        timeout: Value forwarded to every ``session.get`` call.
    """

    def __init__(self, url, timeout=None):
        """Prepare a crawler for *url*.

        Args:
            url: Absolute URL where the crawl starts.
            timeout: Optional timeout (seconds, or a ``(connect, read)``
                tuple) passed to ``requests``. ``None`` keeps the library
                default of waiting indefinitely.
        """
        self.url = url
        self.timeout = timeout
        # Keep the port: dropping it would send root-relative links of a
        # site such as http://localhost:8000 to the wrong server.
        self.base_url = self._origin(url)
        self.links = {}
        self.crawled = []
        self.not_crawled = []
        super(Crawler, self).__init__()

    @staticmethod
    def _origin(url):
        """Return ``scheme://host[:port]`` of *url*, lower-cased, without credentials."""
        parts = urlparse.urlparse(url)
        host = parts.netloc.rpartition('@')[2].lower()
        return parts.scheme + '://' + host

    def _resolve_link(self, href, page_url):
        """Turn an ``href`` found on *page_url* into a crawlable absolute URL.

        Returns:
            The absolute, fragment-free URL, or ``None`` when the link is
            empty, a pure in-page anchor, or points outside ``base_url``
            (which also rejects ``javascript:``, ``mailto:`` and similar).
        """
        href = href.strip()
        if not href or href.startswith('#'):
            return None
        # urljoin handles root-relative, document-relative and
        # protocol-relative references; the fragment is dropped so that
        # "/page" and "/page#top" are treated as the same document.
        absolute = urlparse.urldefrag(urlparse.urljoin(page_url, href))[0]
        # Compare origins instead of doing a substring test, which would
        # accept e.g. "http://evil.example/?next=<base_url>".
        if self._origin(absolute) != self.base_url:
            return None
        return absolute

    def find_links(self, response):
        """Record every same-site ``<a href>`` found in *response*.

        Known links get their ``'found'`` counter incremented; new ones are
        registered in ``links`` and appended to ``not_crawled``.

        Args:
            response: Object exposing the page HTML as ``.text`` and,
                optionally, its final URL as ``.url`` (used to resolve
                relative links; falls back to the starting URL).
        """
        page_url = getattr(response, 'url', None) or self.url
        soup = BeautifulSoup(response.text, 'html.parser')
        for a_tag in soup.find_all('a', href=True):
            link = self._resolve_link(a_tag['href'], page_url)
            if link is None:
                continue
            if link in self.links:
                self.links[link]['found'] += 1
            else:
                self.links[link] = {'found': 1, 'errors': [], 'crawled': False}
                self.not_crawled.append(link)

    def crawl(self):
        """Fetch the starting URL, then process queued links until none remain.

        Raises:
            requests.exceptions.RequestException: If the starting URL itself
                cannot be fetched (failures on later pages are recorded in
                ``links`` instead of being raised).
        """
        self.session = requests.Session()
        try:
            response = self.session.get(self.url, timeout=self.timeout)
            self.find_links(response)
            while self.not_crawled:
                self.process_page()
        finally:
            # Release pooled connections even when the crawl is interrupted
            # (e.g. Ctrl-C) or a request raises.
            self.session.close()

    def process_page(self):
        """Fetch the next queued URL and collect the links it contains.

        Request failures are stored under the URL's ``'errors'`` list; the
        URL is marked as crawled whether or not the fetch succeeded.
        """
        current_url = self.not_crawled.pop(0)
        print(current_url)
        try:
            new_response = self.session.get(current_url, timeout=self.timeout)
        except requests.exceptions.RequestException as e:
            self.links[current_url]['errors'].append({
                # str(e) works on Python 2 and 3; ``e.message`` does not
                # exist on Python 3 exceptions.
                'exc': str(e),
                'url': current_url,
            })
        else:
            self.find_links(new_response)
        finally:
            self.links[current_url]['crawled'] = True
            self.crawled.append(current_url)

    def run(self):
        """Crawl until finished or interrupted with Ctrl-C.

        Returns:
            Tuple ``(links, crawled)`` holding whatever was gathered so far.
        """
        try:
            self.crawl()
        except KeyboardInterrupt:
            # Deliberate: an interrupted crawl still reports partial results.
            pass
        return self.links, self.crawled
if __name__ == '__main__':
    # Command-line entry point: crawl the URL given as the single argument,
    # then report how many pages were processed and how long it took.
    if len(sys.argv) != 2:
        # Passing a string makes sys.exit() write it to stderr and exit with
        # status 1, so a usage error is no longer reported as success.
        sys.exit('provide a URL')
    url = sys.argv[1]
    crawler = Crawler(url)
    start_time = time.time()
    links, crawled = crawler.run()
    crawling_time = time.time() - start_time
    print("\n{} URL's crawled in {}s".format(len(crawled), crawling_time))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment