@iam-mhaseeb
Last active February 4, 2022 08:03
import logging
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

LOGGER = logging.getLogger(__name__)
class BaseScraper:

    def __init__(self, start_urls=None):
        # Fall back to a class-level start_urls attribute if none is passed,
        # so subclasses can declare their seed URLs on the class itself.
        start_urls = start_urls if start_urls is not None else getattr(self, 'start_urls', None)
        if not start_urls:
            raise ValueError('start_urls are required to start crawling...')
        self.start_urls = deque(start_urls)
        # Bookkeeping sets shared by crawl() and parse().
        self.processed_urls = set()
        self.local_urls = set()
        self.foreign_urls = set()
        self.broken_urls = set()

    def crawl(self):
        # Process URLs one by one until we exhaust the queue.
        while self.start_urls:
            url = self.start_urls.popleft()
            LOGGER.info('Processing url: %s', url)
            try:
                response = requests.get(url)
                response.raise_for_status()
            except requests.RequestException as e:
                LOGGER.error('Failed to process url: %s with following error: %s', url, e)
                self.broken_urls.add(url)
                continue
            self.parse(response)

    def parse(self, response):
        raise NotImplementedError('Implementation of parse function is required...')

    def run(self):
        LOGGER.info('Running scraper...')
        self.crawl()
        LOGGER.info('Finished running scraper!')
class CustomScraper(BaseScraper):

    start_urls = ['https://scrapethissite.com']

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        page_netloc = urlparse(response.url).netloc
        for link in soup.find_all('a'):
            href = link.attrs.get('href', '')
            if not href:
                continue
            # Resolve relative links against the page URL, then classify by
            # host: same host as the page is local, anything else is foreign.
            extracted_url = urljoin(response.url, href)
            if urlparse(extracted_url).netloc == page_netloc:
                self.local_urls.add(extracted_url)
            else:
                self.foreign_urls.add(extracted_url)
        self.processed_urls.add(response.url)
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    scraper = CustomScraper()
    scraper.run()
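
A quick usage sketch, not part of the original gist: it assumes the classes above and logging configured as in the __main__ block, reports the collected URL sets, and then feeds discovered local links back into the queue for one more pass, which is one way the deque-based crawl() could grow into a deeper crawler.

# Run the scraper against the seed URLs and summarise what it found.
scraper = CustomScraper()
scraper.run()
LOGGER.info('local: %d, foreign: %d, broken: %d',
            len(scraper.local_urls), len(scraper.foreign_urls), len(scraper.broken_urls))

# Hypothetical follow-up pass: enqueue local links we have not fetched yet,
# so the crawl goes one level deeper than the seed URLs.
scraper.start_urls.extend(scraper.local_urls - scraper.processed_urls)
scraper.run()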