Skip to content

Instantly share code, notes, and snippets.

@realhackcraft
Created January 5, 2023 19:21
Show Gist options
  • Save realhackcraft/8b9a301e6beb5491e8a63ba9bcdd53d8 to your computer and use it in GitHub Desktop.
Save realhackcraft/8b9a301e6beb5491e8a63ba9bcdd53d8 to your computer and use it in GitHub Desktop.
Python web scraper using ChatGPT
import os
import csv
import inspect
import random
import requests
from colorama import Fore
from bs4 import BeautifulSoup
# Accumulator for scraped (title, price, link) tuples; filled by scrap_books().
data = []
# Root URL of the site being scraped; page paths are appended to it.
base_url = 'http://books.toscrape.com'
def scrap_books():
    """Scrape every book from books.toscrape.com into ``data`` and books.csv.

    Walks the paginated catalogue, collecting a (title, price, link) tuple
    per book into the module-level ``data`` list, writes the rows to
    ``books.csv`` in the working directory, and prints a colored summary.

    Raises:
        requests.HTTPError: if any catalogue page returns an error status.
    """
    data.clear()  # avoid duplicated rows if the function is called twice
    page_number = 1
    scraping = True
    while scraping:
        r = requests.get(base_url + "/catalogue/page-{}.html".format(page_number))
        r.raise_for_status()  # fail loudly instead of parsing an error page
        # The site serves UTF-8 but omits the charset header, so requests
        # guesses Latin-1 and '£' decodes as 'Â£'. Fix it at the source.
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'html.parser')
        # One <article class="product_pod"> per book on the page.
        for book in soup.find_all('article', class_='product_pod'):
            title = book.h3.a['title']
            price = book.find('div', class_='product_price').p.text
            price = price.replace('Â', '')  # defensive: strip mojibake if encoding slips
            link = base_url + book.h3.a['href']
            data.append((title, price, link))
        # Keep paging until the 'next' pagination button disappears.
        if soup.find('li', class_='next'):
            page_number += 1
        else:
            scraping = False
    # Explicit utf-8 so the '£' sign survives on every platform.
    with open('books.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('title', 'price', 'link'))  # header row
        writer.writerows(data)
    message = f"Finished scraping {base_url} and found {len(data)} books. "
    print(rainbow_text(message) + print_link(os.path.abspath('books.csv')))
def rainbow_text(text):
    """Return *text* with ANSI colors cycling per character, rainbow-style.

    The original implementation picked a single random color for the whole
    string, which neither looked like a rainbow nor reset the terminal
    color afterwards. This version cycles the palette deterministically
    and appends ``Fore.RESET`` so later output is unaffected.
    """
    rainbow = [Fore.RED, Fore.YELLOW, Fore.GREEN, Fore.CYAN, Fore.BLUE, Fore.MAGENTA]
    colored = ''.join(rainbow[i % len(rainbow)] + ch for i, ch in enumerate(text))
    return colored + Fore.RESET
def print_link(file=None, line=None):
    """Return a PyCharm-clickable 'File "...", line N' link string.

    Args:
        file: path to show; defaults to the caller's source file.
        line: line number to show; defaults to the caller's line.
              Values below 1 are clamped to 1.

    Returns:
        The link string with backslashes normalized to forward slashes.
    """
    if file is None or line is None:
        # inspect.stack() is expensive (it resolves every frame), so walk
        # it once and reuse the caller's frame record for both defaults.
        caller = inspect.stack()[1]
        if file is None:
            file = caller.filename
        if line is None:
            line = caller.lineno
    return f'File "{file}", line {max(line, 1)}'.replace("\\", "/")
# Script entry point: run the scraper only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    scrap_books()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment