This gist contains code to crawl through an entire website and extract products. Tools used: BeautifulSoup4 and Requests to fetch and parse page content.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def parse_content(html_content):
    """Extract and print the product descriptions found on a listing page."""
    soup = BeautifulSoup(html_content, 'html.parser')
    product_containers = soup.find_all('a', {'class': 'plp-card-wrapper'})
    if product_containers:
        for container in product_containers:
            # Product cards carry relative hrefs, so prepend the site root.
            product_url = "https://www.jiomart.com" + container.get("href")
            product_page_response = requests.get(product_url).text
            product_soup = BeautifulSoup(product_page_response, 'html.parser')
            product_description = product_soup.find('div', {'id': 'pdp_description'})
            if product_description:
                print(product_description.find('div').get_text(strip=True))
    else:
        print("Ignored page, since it did not contain any products.")


class WebCrawler:
    def __init__(self, target_url, depth=3):
        self.start_url = target_url
        self.depth = depth
        self.visited_urls = set()  # avoid fetching the same URL twice

    def crawl(self, url, current_depth):
        # Stop once the maximum depth is reached or the URL was already visited.
        if current_depth > self.depth or url in self.visited_urls:
            return
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/91.0.4472.124 Safari/537.36"
            }
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                self.visited_urls.add(url)
                parse_content(response.text)
                # Collect every link on the page and recurse one level deeper.
                soup = BeautifulSoup(response.text, 'html.parser')
                links = soup.find_all('a', href=True)
                for link in links:
                    next_url = link['href']
                    if next_url and not next_url.startswith('#'):
                        next_url = urljoin(self.start_url, next_url)
                        self.crawl(next_url, current_depth + 1)
        except Exception as e:
            print(f"Error crawling {url}: {str(e)}")


if __name__ == "__main__":
    start_url = "https://www.jiomart.com/"
    crawler = WebCrawler(start_url)
    crawler.crawl(start_url, current_depth=1)
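Note that the crawler above joins every href against the start URL and recurses into it, so it can wander off onto external domains. Below is a minimal sketch of one way to keep the crawl on the same site; the SameDomainCrawler subclass and the host comparison via urllib.parse.urlparse are additions for illustration and not part of the original gist.

from urllib.parse import urlparse

class SameDomainCrawler(WebCrawler):
    # Hypothetical subclass (not in the original gist): only follow links
    # whose host matches the start URL's host, e.g. www.jiomart.com.
    def crawl(self, url, current_depth):
        host = urlparse(url).netloc
        if host and host != urlparse(self.start_url).netloc:
            return  # skip links pointing to external domains
        super().crawl(url, current_depth)

# Example usage: crawl jiomart.com only, two levels deep.
# crawler = SameDomainCrawler("https://www.jiomart.com/", depth=2)
# crawler.crawl("https://www.jiomart.com/", current_depth=1)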