This gist contains code to crawl through an entire website and extract products. Tools used: BeautifulSoup4 and Requests to fetch and parse page content.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def parse_content(html_content):
    """Extract and print the product descriptions found on a listing page."""
    soup = BeautifulSoup(html_content, 'html.parser')
    product_containers = soup.find_all('a', {'class': 'plp-card-wrapper'})
    if product_containers:
        for container in product_containers:
            # Product cards carry relative hrefs, so prepend the site root.
            product_url = "https://www.jiomart.com" + container.get("href")
            product_page_response = requests.get(product_url).text
            product_soup = BeautifulSoup(product_page_response, 'html.parser')
            product_description = product_soup.find('div', {'id': 'pdp_description'})
            if product_description:
                print(product_description.find('div').get_text(strip=True))
    else:
        print("Ignored page, since it did not contain any products.")


class WebCrawler:
    def __init__(self, target_url, depth=3):
        self.start_url = target_url
        self.depth = depth
        self.visited_urls = set()  # avoid fetching the same URL twice

    def crawl(self, url, current_depth):
        # Stop once the maximum depth is reached or the URL was already visited.
        if current_depth > self.depth or url in self.visited_urls:
            return
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/91.0.4472.124 Safari/537.36"
            }
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                self.visited_urls.add(url)
                parse_content(response.text)
                # Collect every link on the page and recurse one level deeper.
                soup = BeautifulSoup(response.text, 'html.parser')
                links = soup.find_all('a', href=True)
                for link in links:
                    next_url = link['href']
                    if next_url and not next_url.startswith('#'):
                        next_url = urljoin(self.start_url, next_url)
                        self.crawl(next_url, current_depth + 1)
        except Exception as e:
            print(f"Error crawling {url}: {str(e)}")


if __name__ == "__main__":
    start_url = "https://www.jiomart.com/"
    crawler = WebCrawler(start_url)
    crawler.crawl(start_url, current_depth=1)
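Note that the crawler above joins every href against the start URL and recurses into it, so it can wander off onto external domains. Below is a minimal sketch of one way to keep the crawl on the same site; the SameDomainCrawler subclass and the host comparison via urllib.parse.urlparse are additions for illustration and not part of the original gist.

from urllib.parse import urlparse

class SameDomainCrawler(WebCrawler):
    # Hypothetical subclass (not in the original gist): only follow links
    # whose host matches the start URL's host, e.g. www.jiomart.com.
    def crawl(self, url, current_depth):
        host = urlparse(url).netloc
        if host and host != urlparse(self.start_url).netloc:
            return  # skip links pointing to external domains
        super().crawl(url, current_depth)

# Example usage: crawl jiomart.com only, two levels deep.
# crawler = SameDomainCrawler("https://www.jiomart.com/", depth=2)
# crawler.crawl("https://www.jiomart.com/", current_depth=1)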