Web scraping Goodreads: Popular running books
#### Set up ####
# Import required libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
# Specify the url
base_site = "https://www.goodreads.com/shelf/show/running"
# Make http request
response = requests.get(base_site)
# Check that the request succeeded. A status code of 200 indicates success.
print(response.status_code)
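# Optional hardening (not in the original gist): requests can raise an
# exception on any failed (4xx/5xx) response instead of relying on a
# manual status-code check.
# response.raise_for_status()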
# Get the html from webpage
html = response.content
# Creating a BeautifulSoup object using the lxml parser
soup = BeautifulSoup(html, "lxml")
# Exporting the html to a file; prettify('utf-8') returns bytes, hence mode 'wb'
with open('popularrunningbooks.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))
#### Extracting the url addresses of each book link ####
# First layer: The element that contains all the data
divs = soup.find_all("div", {"class": "elementList"})
# Second layer: Extracting html tags that contain the links
links = [div.find('a') for div in divs]
# Extracting the partial links
relative_url = [link['href'] for link in links]
# Computing the full url addresses
full_url = [urljoin(base_site, relativeurl) for relativeurl in relative_url]
# Filter only the book links
book_url = [url for url in full_url if "https://www.goodreads.com/book/show" in url]
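# urljoin resolves each relative href against the site root, e.g.
# urljoin(base_site, "/book/show/123") -> "https://www.goodreads.com/book/show/123"
# ("/book/show/123" is an illustrative value, not a real book id).
# A quick optional sanity check on the result:
print("Found {} book links".format(len(book_url)))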
#### Scraping information of each book using for loop ####
book_description = []
book_author = []
book_title = []
book_rating = []
book_pages = []
# Creating a loop counter
i = 0
# Loop through all 50 books
for url in book_url:
    # Connect to the url page
    note_resp = requests.get(url)
    # Checking if the request is successful
    if note_resp.status_code == 200:
        print("URL {}: {}".format(i+1, url))
    else:
        print('Status code {}: Skipping URL #{}: {}'.format(note_resp.status_code, i+1, url))
        i = i+1
        continue
    # Get HTML from the url page
    note_html = note_resp.content
    # Create a BeautifulSoup object for the url page
    note_soup = BeautifulSoup(note_html, 'html.parser')
    # Extract author particulars
    author_divs = note_soup.find_all("div", {"class": "authorName__container"})
    author_text = author_divs[0].find_all('a')[0].text
    book_author.append(author_text)
    # Extract title particulars
    title_divs = note_soup.find_all("div", {"class": "last col"})
    title_text = title_divs[0].find_all('h1')[0].text
    book_title.append(title_text)
    # Extract rating particulars
    rating_divs = note_soup.find_all("div", {"class": "uitext stacked", "id": "bookMeta"})
    rating_text = rating_divs[0].find_all("span", {"itemprop": "ratingValue"})[0].text
    book_rating.append(rating_text)
    # Extract page particulars; default to 0 when no page count is listed
    page_divs = note_soup.find_all("div", {"class": "row"})
    try:
        # strip(' pages') trims the trailing " pages" label off the count
        page_text = page_divs[0].find_all("span", {"itemprop": "numberOfPages"})[0].text.strip(' pages')
    except IndexError:
        page_text = 0
    book_pages.append(page_text)
    # Extract description particulars: the second span typically holds the
    # full text, the first a truncated preview; fall back to "Nil" if absent
    description_divs = note_soup.find_all("div", {"class": "readable stacked", "id": "description"})
    try:
        description_text = description_divs[0].find_all("span")[1].text
    except IndexError:
        try:
            description_text = description_divs[0].find_all("span")[0].text
        except IndexError:
            description_text = "Nil"
    book_description.append(description_text)
    # Incrementing the loop counter
    i = i+1
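# Note (not part of the original script): the loop above sends its requests
# back to back. To be gentler on the server, a short pause can be added at
# the end of each iteration, e.g.:
# import time
# time.sleep(1)  # wait one second between requests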
#### Some simple data processing ####
revised_book_title = [book.strip() for book in book_title]
revised_book_rating = [float(rating.strip()) for rating in book_rating]
revised_book_pages = [int(page) for page in book_pages]
revised_book_description = [description.strip() for description in book_description]
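# An alternative for the rating conversion using pandas (a sketch; the list
# comprehension above already does the job):
# revised_book_rating = pd.to_numeric([rating.strip() for rating in book_rating])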
#### Organising the data into a dataframe ####
book_df = pd.DataFrame()
book_df["Book Title"] = revised_book_title
book_df["Author"] = book_author
book_df["Rating"] = revised_book_rating
book_df["Pages"] = revised_book_pages
book_df["Description"] = revised_book_description
book_df["Links"] = book_url
# Preview the dataframe
book_df.head()
# Sorting the dataframe based on ratings
sorted_book_df = book_df.sort_values(by='Rating', ascending = False)
sorted_book_df.reset_index(drop=True, inplace = True)
# Export dataframe
sorted_book_df.to_csv("top running books.csv")
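# By default to_csv also writes the dataframe index as an extra column;
# pass index=False to keep only the book columns (optional):
# sorted_book_df.to_csv("top running books.csv", index=False)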