@iam-mhaseeb
Last active February 4, 2022 08:03
import logging
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

LOGGER = logging.getLogger(__name__)
class BaseScraper:

    def __init__(self, start_urls=None):
        # Fall back to a class-level start_urls attribute if none is passed,
        # so subclasses can declare their seed URLs on the class itself.
        start_urls = start_urls if start_urls is not None else getattr(self, 'start_urls', None)
        if not start_urls:
            raise ValueError('start_urls are required to start crawling...')
        self.start_urls = deque(start_urls)
        # Bookkeeping sets shared by crawl() and parse().
        self.processed_urls = set()
        self.local_urls = set()
        self.foreign_urls = set()
        self.broken_urls = set()

    def crawl(self):
        # Process URLs one by one until we exhaust the queue.
        while self.start_urls:
            url = self.start_urls.popleft()
            LOGGER.info('Processing url: %s', url)
            try:
                response = requests.get(url)
                response.raise_for_status()
            except requests.RequestException as e:
                LOGGER.error('Failed to process url: %s with following error: %s', url, e)
                self.broken_urls.add(url)
                continue
            self.parse(response)

    def parse(self, response):
        raise NotImplementedError('Implementation of parse function is required...')

    def run(self):
        LOGGER.info('Running scraper...')
        self.crawl()
        LOGGER.info('Finished running scraper!')
class CustomScraper(BaseScraper):

    start_urls = ['https://scrapethissite.com']

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        page_netloc = urlparse(response.url).netloc
        for link in soup.find_all('a'):
            href = link.attrs.get('href', '')
            if not href:
                continue
            # Resolve relative links against the page URL, then classify by
            # host: same host as the page is local, anything else is foreign.
            extracted_url = urljoin(response.url, href)
            if urlparse(extracted_url).netloc == page_netloc:
                self.local_urls.add(extracted_url)
            else:
                self.foreign_urls.add(extracted_url)
        self.processed_urls.add(response.url)
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    scraper = CustomScraper()
    scraper.run()
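
A quick usage sketch, not part of the original gist: it assumes the classes above and logging configured as in the __main__ block, reports the collected URL sets, and then feeds discovered local links back into the queue for one more pass, which is one way the deque-based crawl() could grow into a deeper crawler.

# Run the scraper against the seed URLs and summarise what it found.
scraper = CustomScraper()
scraper.run()
LOGGER.info('local: %d, foreign: %d, broken: %d',
            len(scraper.local_urls), len(scraper.foreign_urls), len(scraper.broken_urls))

# Hypothetical follow-up pass: enqueue local links we have not fetched yet,
# so the crawl goes one level deeper than the seed URLs.
scraper.start_urls.extend(scraper.local_urls - scraper.processed_urls)
scraper.run()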