@tushar-rupani
Last active January 22, 2024 04:53
This gist contains code to crawl through an entire website and extract products. It uses BeautifulSoup4 and Requests to fetch and parse content.
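As a quick orientation before the full crawler: a minimal, self-contained sketch of the two library calls the script leans on, BeautifulSoup's find_all with a class filter and urllib.parse.urljoin for resolving relative links. The HTML snippet and the "card" class below are made up for illustration; they are not JioMart's actual markup.

from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Illustrative markup; "card" is a stand-in class name, not JioMart's real one.
html = '<a class="card" href="/p/1">One</a><a class="card" href="/p/2">Two</a>'
soup = BeautifulSoup(html, 'html.parser')
for anchor in soup.find_all('a', {'class': 'card'}):
    # urljoin resolves a relative href against the site root, as the crawler does.
    print(urljoin('https://example.com/', anchor.get('href')))
# Prints https://example.com/p/1 and https://example.com/p/2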
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def parse_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    product_containers = soup.find_all('a', {'class': 'plp-card-wrapper'})
    if product_containers:
        for container in product_containers:
            product_url = "https://www.jiomart.com" + container.get("href")
            product_page_response = requests.get(product_url, timeout=10).text
            product_soup = BeautifulSoup(product_page_response, 'html.parser')
            product_description = product_soup.find('div', {'id': 'pdp_description'})
            # Guard against product pages without a description block.
            if product_description is not None and product_description.find('div') is not None:
                print(product_description.find('div').get_text(strip=True))
    else:
        print("Ignored page, since it did not contain any products.")


class WebCrawler:
    def __init__(self, target_url, depth=3):
        self.start_url = target_url
        self.depth = depth
        self.visited_urls = set()

    def crawl(self, url, current_depth):
        # Stop once the depth limit is hit or the URL has already been visited.
        if current_depth > self.depth or url in self.visited_urls:
            return
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                              "Chrome/91.0.4472.124 Safari/537.36"
            }
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                self.visited_urls.add(url)
                parse_content(response.text)
                # Follow every link on the page, one level deeper.
                soup = BeautifulSoup(response.text, 'html.parser')
                links = soup.find_all('a', href=True)
                for link in links:
                    next_url = link['href']
                    if next_url and not next_url.startswith('#'):
                        next_url = urljoin(self.start_url, next_url)
                        self.crawl(next_url, current_depth + 1)
        except Exception as e:
            print(f"Error crawling {url}: {str(e)}")


if __name__ == "__main__":
    start_url = "https://www.jiomart.com/"
    crawler = WebCrawler(start_url)
    crawler.crawl(start_url, current_depth=1)
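To try the script, the two third-party dependencies are the standard PyPI packages; the crawler.py filename below is just an assumed name for this gist's file:

pip install requests beautifulsoup4
python crawler.py

With the default depth of 3 on a large site this can issue a lot of requests, so lowering depth or sleeping briefly between requests is a sensible courtesy.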