Skip to content

Instantly share code, notes, and snippets.

@kashiftriffort
Created July 26, 2020 07:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kashiftriffort/a2e7ad06dca5acc38a76007465fe6226 to your computer and use it in GitHub Desktop.
Save kashiftriffort/a2e7ad06dca5acc38a76007465fe6226 to your computer and use it in GitHub Desktop.
Python BeautifulSoup scraper that collects book covers, titles, descriptions, average ratings, rating counts, and authors from www.goodreads.com
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
# Scrape the Goodreads "thriller" shelf: for every book listed on the shelf
# page, fetch the book's own page and record its title, description, author,
# cover image URL, average rating and rating count in temp.csv.

url = "https://www.goodreads.com/shelf/show/thriller"
_BASE_URL = "https://www.goodreads.com"
# Without a timeout a stalled connection would block the script forever.
_REQUEST_TIMEOUT = 30  # seconds
_COLUMNS = ['title', 'description', 'author', 'image', 'avg_rating', 'rating_count']

# Compiled once, outside the per-book loop.
# Removes every parenthesised / bracketed fragment from a title.
_TITLE_BRACKETS_RE = re.compile(r"[\(\[].*?[\)\]]")
_AVG_RATING_RE = re.compile(r'avg rating ([\d.]+)')
_RATING_COUNT_RE = re.compile(r'([\d,]+) ratings')


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    match = pattern.search(text)
    return match.group(1) if match else None


def _extract_book_details(book_soup):
    """Return ``(description, image_url)`` from a parsed book page.

    Either element is None when the page does not contain it, so a single
    unusual page no longer aborts the whole run.
    """
    description = None
    # The first element carrying a data-text-id attribute supplies the suffix
    # of the id of the <span> that holds the description text.
    marker = book_soup.find(attrs={'data-text-id': True})
    if marker is not None:
        container = book_soup.find(
            'span', id='freeTextContainer' + marker['data-text-id']
        )
        if container is not None:
            description = container.get_text()

    cover = book_soup.find('img', id='coverImage')
    image = cover.get('src') if cover is not None else None
    return description, image


page = requests.get(url, timeout=_REQUEST_TIMEOUT)
soup = bs(page.content, 'html.parser')
# Dump of the parsed shelf page, kept from the original script.
print(soup)

titles = soup.find_all('a', class_='bookTitle')
authors = soup.find_all('a', class_='authorName')
ratings = soup.find_all('span', attrs={'class': 'greyText smallText'})

# NOTE(review): zip() pairs the three lists positionally, which presumably
# assumes exactly one authorName link and one rating span per book - confirm
# against the live page; a book with several authors would shift the pairing.
rows = []
for title, author, rating in zip(titles, authors, ratings):
    book_page = requests.get(_BASE_URL + title["href"], timeout=_REQUEST_TIMEOUT)
    book_soup = bs(book_page.content, 'html.parser')
    description, image = _extract_book_details(book_soup)
    rating_text = rating.text
    rows.append([
        _TITLE_BRACKETS_RE.sub("", title.get_text()),
        description,
        author.get_text(),
        image,
        _first_group(_AVG_RATING_RE, rating_text),
        _first_group(_RATING_COUNT_RE, rating_text),
    ])

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame on every call.
df = pd.DataFrame(rows, columns=_COLUMNS)
df.to_csv('temp.csv', index=False, encoding='utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment