Skip to content

Instantly share code, notes, and snippets.

@VijayaMalla
Created February 10, 2020 21:18
Show Gist options
  • Save VijayaMalla/88c62b0d240bd3ace637a71d71e77f47 to your computer and use it in GitHub Desktop.
Save VijayaMalla/88c62b0d240bd3ace637a71d71e77f47 to your computer and use it in GitHub Desktop.
Scrape http://books.toscrape.com/ and get all categories and page links
{'Category': 'Travel', 'Link': 'http://books.toscrape.com/catalogue/category/books/travel_2/index.html'}
{'Category': 'Mystery', 'Link': 'http://books.toscrape.com/catalogue/category/books/mystery_3/index.html'}
{'Category': 'Historical Fiction', 'Link': 'http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html'}
{'Category': 'Sequential Art', 'Link': 'http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html'}
{'Category': 'Classics', 'Link': 'http://books.toscrape.com/catalogue/category/books/classics_6/index.html'}
{'Category': 'Philosophy', 'Link': 'http://books.toscrape.com/catalogue/category/books/philosophy_7/index.html'}
{'Category': 'Romance', 'Link': 'http://books.toscrape.com/catalogue/category/books/romance_8/index.html'}
{'Category': 'Womens Fiction', 'Link': 'http://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html'}
{'Category': 'Fiction', 'Link': 'http://books.toscrape.com/catalogue/category/books/fiction_10/index.html'}
{'Category': 'Childrens', 'Link': 'http://books.toscrape.com/catalogue/category/books/childrens_11/index.html'}
{'Category': 'Religion', 'Link': 'http://books.toscrape.com/catalogue/category/books/religion_12/index.html'}
{'Category': 'Nonfiction', 'Link': 'http://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html'}
{'Category': 'Music', 'Link': 'http://books.toscrape.com/catalogue/category/books/music_14/index.html'}
{'Category': 'Default', 'Link': 'http://books.toscrape.com/catalogue/category/books/default_15/index.html'}
{'Category': 'Science Fiction', 'Link': 'http://books.toscrape.com/catalogue/category/books/science-fiction_16/index.html'}
{'Category': 'Sports and Games', 'Link': 'http://books.toscrape.com/catalogue/category/books/sports-and-games_17/index.html'}
{'Category': 'Add a comment', 'Link': 'http://books.toscrape.com/catalogue/category/books/add-a-comment_18/index.html'}
{'Category': 'Fantasy', 'Link': 'http://books.toscrape.com/catalogue/category/books/fantasy_19/index.html'}
{'Category': 'New Adult', 'Link': 'http://books.toscrape.com/catalogue/category/books/new-adult_20/index.html'}
{'Category': 'Young Adult', 'Link': 'http://books.toscrape.com/catalogue/category/books/young-adult_21/index.html'}
{'Category': 'Science', 'Link': 'http://books.toscrape.com/catalogue/category/books/science_22/index.html'}
{'Category': 'Poetry', 'Link': 'http://books.toscrape.com/catalogue/category/books/poetry_23/index.html'}
{'Category': 'Paranormal', 'Link': 'http://books.toscrape.com/catalogue/category/books/paranormal_24/index.html'}
{'Category': 'Art', 'Link': 'http://books.toscrape.com/catalogue/category/books/art_25/index.html'}
{'Category': 'Psychology', 'Link': 'http://books.toscrape.com/catalogue/category/books/psychology_26/index.html'}
{'Category': 'Autobiography', 'Link': 'http://books.toscrape.com/catalogue/category/books/autobiography_27/index.html'}
{'Category': 'Parenting', 'Link': 'http://books.toscrape.com/catalogue/category/books/parenting_28/index.html'}
{'Category': 'Adult Fiction', 'Link': 'http://books.toscrape.com/catalogue/category/books/adult-fiction_29/index.html'}
{'Category': 'Humor', 'Link': 'http://books.toscrape.com/catalogue/category/books/humor_30/index.html'}
{'Category': 'Horror', 'Link': 'http://books.toscrape.com/catalogue/category/books/horror_31/index.html'}
{'Category': 'History', 'Link': 'http://books.toscrape.com/catalogue/category/books/history_32/index.html'}
{'Category': 'Food and Drink', 'Link': 'http://books.toscrape.com/catalogue/category/books/food-and-drink_33/index.html'}
{'Category': 'Christian Fiction', 'Link': 'http://books.toscrape.com/catalogue/category/books/christian-fiction_34/index.html'}
{'Category': 'Business', 'Link': 'http://books.toscrape.com/catalogue/category/books/business_35/index.html'}
{'Category': 'Biography', 'Link': 'http://books.toscrape.com/catalogue/category/books/biography_36/index.html'}
{'Category': 'Thriller', 'Link': 'http://books.toscrape.com/catalogue/category/books/thriller_37/index.html'}
{'Category': 'Contemporary', 'Link': 'http://books.toscrape.com/catalogue/category/books/contemporary_38/index.html'}
{'Category': 'Spirituality', 'Link': 'http://books.toscrape.com/catalogue/category/books/spirituality_39/index.html'}
{'Category': 'Academic', 'Link': 'http://books.toscrape.com/catalogue/category/books/academic_40/index.html'}
{'Category': 'Self Help', 'Link': 'http://books.toscrape.com/catalogue/category/books/self-help_41/index.html'}
{'Category': 'Historical', 'Link': 'http://books.toscrape.com/catalogue/category/books/historical_42/index.html'}
{'Category': 'Christian', 'Link': 'http://books.toscrape.com/catalogue/category/books/christian_43/index.html'}
{'Category': 'Suspense', 'Link': 'http://books.toscrape.com/catalogue/category/books/suspense_44/index.html'}
{'Category': 'Short Stories', 'Link': 'http://books.toscrape.com/catalogue/category/books/short-stories_45/index.html'}
{'Category': 'Novels', 'Link': 'http://books.toscrape.com/catalogue/category/books/novels_46/index.html'}
{'Category': 'Health', 'Link': 'http://books.toscrape.com/catalogue/category/books/health_47/index.html'}
{'Category': 'Politics', 'Link': 'http://books.toscrape.com/catalogue/category/books/politics_48/index.html'}
{'Category': 'Cultural', 'Link': 'http://books.toscrape.com/catalogue/category/books/cultural_49/index.html'}
{'Category': 'Erotica', 'Link': 'http://books.toscrape.com/catalogue/category/books/erotica_50/index.html'}
{'Category': 'Crime', 'Link': 'http://books.toscrape.com/catalogue/category/books/crime_51/index.html'}
# Import Packages
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate
# Download and Parse the HTML
start_url = 'http://books.toscrape.com/index.html'
# Local file that receives one category record per line.
output_path = 'BookCategoryList'
# Matches a newline together with the indentation spaces that follow it,
# which is how the site pads the text inside each category link.
regex = re.compile(r'\n[ ]*')


def clean_category_name(raw_text):
    """Return *raw_text* with newline/indent padding and outer whitespace removed."""
    return regex.sub('', raw_text).strip()


def build_category_record(raw_text, href, base_url=start_url):
    """Build one ``{'Category': ..., 'Link': ...}`` record.

    Args:
        raw_text: Text content of a category list item, padding included.
        href: The ``href`` attribute of the item's anchor (may be relative).
        base_url: URL of the page the href was found on; relative hrefs are
            resolved against it.

    Returns:
        The record dict, or ``None`` when the cleaned name or the href is
        empty, so callers can skip incomplete entries.
    """
    category = clean_category_name(raw_text)
    # Both parts are required; a record missing either one is useless.
    if not category or not href:
        return None
    return {'Category': category, 'Link': urljoin(base_url, href)}


def scrape_categories(url=start_url):
    """Download *url* and return the list of category records found on it.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # Download the HTML; the timeout keeps a stalled server from hanging us.
    downloaded_html = requests.get(url, timeout=30)
    downloaded_html.raise_for_status()
    # Parse the HTML with BeautifulSoup and create a soup object
    soup = BeautifulSoup(downloaded_html.text, "lxml")
    records = []
    for element in soup.select('.side_categories ul li ul li'):
        anchor_tag = element.find('a', href=True)
        if anchor_tag is None:
            # A list item without a link carries nothing worth recording.
            continue
        record = build_category_record(
            element.get_text(), anchor_tag['href'], url)
        if record is not None:
            records.append(record)
    return records


def save_categories(records, path=output_path):
    """Write each record's ``str()`` form to *path*, one per line."""
    with open(path, 'w', encoding='utf-8') as file:
        file.writelines(f"{item}\n" for item in records)


def main():
    """Scrape the category list and save a local copy."""
    book_dict = scrape_categories(start_url)
    # Save a local copy
    save_categories(book_dict, output_path)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment